From e09151dcddfa440bb0a6c79ced1591dd8aed3e0a Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 24 Apr 2023 02:05:49 +0200
Subject: [PATCH 1/4] fix #149 - load tensors by type, ignoring filetype

---
 ggml-loader/Cargo.toml        |   2 +-
 ggml-loader/src/lib.rs        |  28 ++--
 ggml-loader/src/util.rs       |  29 +---
 ggml/src/lib.rs               |  23 ++++
 llama-cli/src/cli_args.rs     |  18 +--
 llama-cli/src/main.rs         |   2 +-
 llama-rs/src/convert.rs       |  19 +--
 llama-rs/src/lib.rs           |   2 +-
 llama-rs/src/loader.rs        |  17 ++-
 llama-rs/src/loader2.rs       | 252 ++++++++++++++--------------------
 llama-rs/src/loader_common.rs |  72 +++++++++-
 llama-rs/src/model.rs         | 173 +++++++++++++++++++++--
 12 files changed, 406 insertions(+), 231 deletions(-)

diff --git a/ggml-loader/Cargo.toml b/ggml-loader/Cargo.toml
index ab711363..2d088758 100644
--- a/ggml-loader/Cargo.toml
+++ b/ggml-loader/Cargo.toml
@@ -7,4 +7,4 @@ edition = "2021"

 [dependencies]
 ggml = { path = "../ggml" }
-thiserror = "*"
+thiserror = "1.0"

diff --git a/ggml-loader/src/lib.rs b/ggml-loader/src/lib.rs
index 7d29d4b3..47239f08 100644
--- a/ggml-loader/src/lib.rs
+++ b/ggml-loader/src/lib.rs
@@ -10,7 +10,7 @@ use util::*;

 pub type ElementType = ggml::Type;

-/// file type containing the model
+/// the format of the file containing the model
 #[derive(Debug, PartialEq, Clone, Copy)]
 #[allow(clippy::upper_case_acronyms)]
 pub enum ContainerType {
@@ -21,7 +21,6 @@ pub enum ContainerType {
     /// mmap-able format
     GGJT,
 }
-
 impl ContainerType {
     pub fn support_mmap(&self) -> bool {
         match self {
@@ -64,10 +63,19 @@ pub struct TensorInfo {
     pub n_dims: usize,
     pub dims: [usize; 2],
     pub n_elements: usize,
-    pub ftype: ElementType,
+    pub element_type: ElementType,
     /// start of tensor - start of file
     pub start_offset: u64,
 }
+impl TensorInfo {
+    pub fn calc_size(&self) -> usize {
+        let mut size = ggml::type_size(self.element_type);
+        for &dim in &self.dims[0..self.n_dims] {
+            size *= dim;
+        }
+        size / ggml::blck_size(self.element_type)
+    }
+}

 /// Info in hyperparameter used for later loading tasks. Used in callback.
 /// see [`LoadHandler::load_hyper_parameters`]
@@ -78,10 +86,7 @@ pub struct PartialHyperparameters {

 pub enum TensorDataTreatment<'a> {
     CopyInto(&'a mut [u8]),
-    SeekPast {
-        /// should be `tensor.nbytes`
-        n_bytes: usize,
-    },
+    Skip,
 }

 #[allow(unused_variables)]
@@ -173,7 +178,9 @@ pub fn load_weights(
         // load tensor header
         let n_dims: usize = read_i32(reader)?.try_into()?;
         let name_len = read_i32(reader)?;
-        let ftype = decode_element_type_res(read_i32(reader)?)?;
+        let ftype = read_i32(reader)?;
+        let ftype =
+            ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType(ftype))?;

         let mut n_elements: usize = 1;
         let mut dims = [1usize, 1];
@@ -214,9 +221,10 @@ pub fn load_weights(
             dims,
             n_dims,
             n_elements,
-            ftype,
+            element_type: ftype,
             start_offset: offset_aligned,
         };
+        let n_bytes = tensor_info.calc_size();

         match controlflow_to_result(handler.tensor_buffer(tensor_info))? {
             TensorDataTreatment::CopyInto(buf) => {
@@ -225,7 +233,7 @@
                 }
                 reader.read_exact(buf)?;
             }
-            TensorDataTreatment::SeekPast { n_bytes } => {
+            TensorDataTreatment::Skip => {
                 // skip if no buffer is given
                 reader.seek(SeekFrom::Start(offset_aligned + n_bytes as u64))?;
             }
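For intuition, the new `TensorInfo::calc_size` computes a tensor's byte size from its own element type: multiply the per-block byte size by the element count, then divide by the elements per block. A standalone sketch of the same arithmetic, with ggml's Q4_0 layout (20-byte blocks covering 32 elements) inlined as assumed constants:

    // Mirrors TensorInfo::calc_size with the ggml::type_size/blck_size lookups
    // replaced by assumed Q4_0 constants; not part of the patch itself.
    fn calc_size(type_size: usize, blck_size: usize, dims: &[usize]) -> usize {
        let mut size = type_size;
        for &dim in dims {
            size *= dim;
        }
        size / blck_size
    }

    fn main() {
        // A 4096x4096 Q4_0 tensor: 16,777,216 elements / 32 per block * 20 bytes.
        assert_eq!(calc_size(20, 32, &[4096, 4096]), 10_485_760);
    }

This is what lets `load_weights` compute `n_bytes` up front and skip past tensor data without trusting a caller-supplied byte count.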
diff --git a/ggml-loader/src/util.rs b/ggml-loader/src/util.rs
index 33374fd6..9a759aac 100644
--- a/ggml-loader/src/util.rs
+++ b/ggml-loader/src/util.rs
@@ -1,7 +1,7 @@
 pub use std::io::{BufRead, Seek, SeekFrom};
 use std::ops::ControlFlow;

-use crate::{ElementType, LoadError};
+use crate::LoadError;

 pub fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], std::io::Error> {
     let mut bytes = [0u8; N];
@@ -35,33 +35,6 @@ pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error>
     reader.fill_buf().map(|b| !b.is_empty())
 }

-pub fn decode_element_type(ftype: i32) -> Option<ElementType> {
-    match ftype {
-        0 => Some(ggml::Type::F32),
-        1 => Some(ggml::Type::F16),
-        2 => Some(ggml::Type::Q4_0),
-        3 => Some(ggml::Type::Q4_1),
-        _ => None,
-    }
-}
-
-pub fn encode_element_type(element_type: ElementType) -> Option<i32> {
-    match element_type {
-        ggml::Type::F32 => Some(0),
-        ggml::Type::F16 => Some(1),
-        ggml::Type::Q4_0 => Some(2),
-        ggml::Type::Q4_1 => Some(3),
-        _ => None,
-    }
-}
-
-pub fn decode_element_type_res<T>(ftype: i32) -> Result<ElementType, LoadError<T>> {
-    match decode_element_type(ftype) {
-        Some(x) => Ok(x),
-        None => Err(LoadError::UnsupportedElementType(ftype)),
-    }
-}
-
 pub fn controlflow_to_result<A, B>(x: ControlFlow<A, B>) -> Result<B, A> {
     match x {
         ControlFlow::Continue(x) => Ok(x),

diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 11d4246b..37188a64 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -24,6 +24,9 @@ pub const FILE_MAGIC_UNVERSIONED: u32 = 0x67676d6c;
 /// The currently-supported format version for `ggml` files.
 pub const FORMAT_VERSION: u32 = 1;

+/// The size of a `ggml` object.
+pub const OBJECT_SIZE: usize = ggml_sys::GGML_OBJECT_SIZE;
+
 #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
 /// The type of a value in `ggml`.
 pub enum Type {
@@ -32,6 +35,12 @@ pub enum Type {
     Q4_0,
     /// Quantized 4-bit (type 1); used by GPTQ.
     Q4_1,
+    /// Quantized 4-bit (type 2).
+    Q4_2,
+    /// Quantized 4-bit (type 3).
+    Q4_3,
+    /// Quantized 8-bit (type 0).
+    Q8_0,
     /// Integer 32-bit.
     I32,
     /// Float 16-bit.
@@ -44,6 +53,9 @@ impl From<Type> for ggml_sys::ggml_type {
         match t {
             Type::Q4_0 => ggml_sys::ggml_type_GGML_TYPE_Q4_0,
             Type::Q4_1 => ggml_sys::ggml_type_GGML_TYPE_Q4_1,
+            Type::Q4_2 => ggml_sys::ggml_type_GGML_TYPE_Q4_2,
+            Type::Q4_3 => ggml_sys::ggml_type_GGML_TYPE_Q4_3,
+            Type::Q8_0 => ggml_sys::ggml_type_GGML_TYPE_Q8_0,
             Type::I32 => ggml_sys::ggml_type_GGML_TYPE_I32,
             Type::F16 => ggml_sys::ggml_type_GGML_TYPE_F16,
             Type::F32 => ggml_sys::ggml_type_GGML_TYPE_F32,
@@ -56,6 +68,9 @@ impl TryFrom<ggml_sys::ggml_type> for Type {
         match t {
             ggml_sys::ggml_type_GGML_TYPE_Q4_0 => Ok(Type::Q4_0),
             ggml_sys::ggml_type_GGML_TYPE_Q4_1 => Ok(Type::Q4_1),
+            ggml_sys::ggml_type_GGML_TYPE_Q4_2 => Ok(Type::Q4_2),
+            ggml_sys::ggml_type_GGML_TYPE_Q4_3 => Ok(Type::Q4_3),
+            ggml_sys::ggml_type_GGML_TYPE_Q8_0 => Ok(Type::Q8_0),
             ggml_sys::ggml_type_GGML_TYPE_I32 => Ok(Type::I32),
             ggml_sys::ggml_type_GGML_TYPE_F16 => Ok(Type::F16),
             ggml_sys::ggml_type_GGML_TYPE_F32 => Ok(Type::F32),
@@ -68,6 +83,9 @@ impl std::fmt::Display for Type {
         match self {
             Type::Q4_0 => write!(f, "q4_0"),
             Type::Q4_1 => write!(f, "q4_1"),
+            Type::Q4_2 => write!(f, "q4_2"),
+            Type::Q4_3 => write!(f, "q4_3"),
+            Type::Q8_0 => write!(f, "q8_0"),
             Type::I32 => write!(f, "i32"),
             Type::F16 => write!(f, "f16"),
             Type::F32 => write!(f, "f32"),
@@ -510,6 +528,11 @@ pub struct Tensor {
 }

 impl Tensor {
+    /// Size of the `ggml_tensor` struct in bytes.
+    ///
+    /// Exposed for purposes of determining context size.
+    pub const C_TYPE_SIZE: usize = std::mem::size_of::<ggml_sys::ggml_tensor>();
+
     /// Creates a shared copy of this tensor pointer.
     pub fn share(&self) -> Self {
         Tensor {
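A quick usage sketch for the widened `Type` conversions (hypothetical check, assuming `ggml` and `ggml-sys` are both direct dependencies):

    fn main() {
        // Round-trip one of the new quantization types through the raw sys type.
        let t = ggml::Type::Q4_2;
        let raw: ggml_sys::ggml_type = t.into();
        assert_eq!(ggml::Type::try_from(raw).ok(), Some(t));
        assert_eq!(t.to_string(), "q4_2");
    }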
diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs
index e31d4f48..fc064017 100644
--- a/llama-cli/src/cli_args.rs
+++ b/llama-cli/src/cli_args.rs
@@ -373,12 +373,12 @@ pub struct Convert {
     pub directory: PathBuf,

     /// File type to convert to
-    #[arg(long, short = 't', value_enum, default_value_t = ElementType::Q4_0)]
-    pub element_type: ElementType,
+    #[arg(long, short = 't', value_enum, default_value_t = FileType::Q4_0)]
+    pub file_type: FileType,
 }

 #[derive(Parser, Debug, ValueEnum, Clone, Copy)]
-pub enum ElementType {
+pub enum FileType {
     /// Quantized 4-bit (type 0).
     Q4_0,
     /// Quantized 4-bit (type 1); used by GPTQ.
@@ -388,13 +388,13 @@
     /// Float 32-bit.
     F32,
 }

-impl From<ElementType> for llama_rs::ElementType {
-    fn from(t: ElementType) -> Self {
+impl From<FileType> for llama_rs::FileType {
+    fn from(t: FileType) -> Self {
         match t {
-            ElementType::Q4_0 => llama_rs::ElementType::Q4_0,
-            ElementType::Q4_1 => llama_rs::ElementType::Q4_1,
-            ElementType::F16 => llama_rs::ElementType::F16,
-            ElementType::F32 => llama_rs::ElementType::F32,
+            FileType::Q4_0 => llama_rs::FileType::MostlyQ4_0,
+            FileType::Q4_1 => llama_rs::FileType::MostlyQ4_1,
+            FileType::F16 => llama_rs::FileType::MostlyF16,
+            FileType::F32 => llama_rs::FileType::F32,
         }
     }
 }

diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs
index e0f5ced3..de8323b0 100644
--- a/llama-cli/src/main.rs
+++ b/llama-cli/src/main.rs
@@ -22,7 +22,7 @@ fn main() -> Result<()> {
         Args::DumpTokens(args) => dump_tokens(&args)?,
         Args::Repl(args) => interactive(&args, false)?,
         Args::ChatExperimental(args) => interactive(&args, true)?,
-        Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.element_type.into()),
+        Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.file_type.into()),
     }

     Ok(())

diff --git a/llama-rs/src/convert.rs b/llama-rs/src/convert.rs
index 67557b8f..07c4939c 100644
--- a/llama-rs/src/convert.rs
+++ b/llama-rs/src/convert.rs
@@ -16,20 +16,19 @@ use std::{
     vec,
 };

-use crate::{util, Hyperparameters, Vocabulary};
-use ggml_loader::util::encode_element_type;
+use crate::{loader_common::FileType, util, Hyperparameters, Vocabulary};

 /// Converts a `pth` file to a `ggml` file.
-pub fn convert_pth_to_ggml(model_directory: &Path, element_type: ggml::Type) {
+pub fn convert_pth_to_ggml(model_directory: &Path, file_type: FileType) {
     let tokenizer_path = model_directory.parent().unwrap().join("tokenizer.model");
     let vocab = load_vocabulary(tokenizer_path.as_path());

-    let hparams = load_hyperparameters(model_directory, element_type, &vocab);
+    let hparams = load_hyperparameters(model_directory, file_type, &vocab);

     let model_files = util::find_all_model_files(model_directory).unwrap();
     for (i, _file) in model_files.iter().enumerate() {
-        let fname_out = model_directory.join(format!("rust-model-{element_type}.bin"));
+        let fname_out = model_directory.join(format!("rust-model-{file_type}.bin"));
         let mut file = File::create(fname_out).expect("Unable to create file");
         write_header(file.borrow_mut(), &hparams).unwrap();
         write_tokens(file.borrow_mut(), &vocab).unwrap();
@@ -66,11 +65,7 @@ fn load_vocabulary(path: &Path) -> Vocabulary {
     }
 }

-fn load_hyperparameters(
-    path: &Path,
-    element_type: ggml::Type,
-    vocab: &Vocabulary,
-) -> Hyperparameters {
+fn load_hyperparameters(path: &Path, file_type: FileType, vocab: &Vocabulary) -> Hyperparameters {
     #[derive(Deserialize)]
     struct HyperParametersJson {
         dim: usize,
@@ -83,7 +78,7 @@ fn load_hyperparameters(
     let json = read_to_string(path.join("params.json")).expect("Unable to read file");
     let json: HyperParametersJson = serde_json::from_str(&json).expect("Unable to parse json");
     Hyperparameters {
-        element_type,
+        file_type,
         n_ctx: 0,
         n_embd: json.dim,
         n_head: json.n_heads,
@@ -107,7 +102,7 @@ fn write_header(fout: &mut File, hparams: &Hyperparameters) -> Result<(), String> {
         i32::try_from(hparams.n_head).unwrap(),
         i32::try_from(hparams.n_layer).unwrap(),
         i32::try_from(hparams.n_embd / hparams.n_head).unwrap(),
-        encode_element_type(hparams.element_type).unwrap(),
+        hparams.file_type.into(),
     ];

     let mut packed_values: Vec<i32> = vec![];
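`write_header` now serializes the `FileType` wire value directly instead of re-encoding an element type. A rough sketch of the resulting hyperparameter block (field order taken from the `values` array above; the little-endian packing and the concrete LLaMA-7B-style numbers are assumptions, not taken from the patch):

    // Pack hyperparameter i32s in declaration order, assuming the
    // little-endian layout used by ggml files.
    fn pack_i32s(values: &[i32]) -> Vec<u8> {
        values.iter().flat_map(|v| v.to_le_bytes()).collect()
    }

    fn main() {
        // n_vocab, n_embd, n_mult, n_head, n_layer, n_rot, file_type (Q4_0 => 2)
        let header = pack_i32s(&[32000, 4096, 256, 32, 32, 128, 2]);
        assert_eq!(header.len(), 7 * 4);
    }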
diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 3f0a6c69..88d26d0c 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -19,7 +19,7 @@ pub use inference_session::{
     InferenceSession, InferenceSessionParameters, InferenceSnapshot, ModelKVMemoryType,
     SnapshotError,
 };
-pub use loader_common::{LoadError, LoadProgress};
+pub use loader_common::{FileType, LoadError, LoadProgress};
 pub use model::{Hyperparameters, Model};
 pub use util::TokenUtf8Buffer;
 pub use vocabulary::{TokenBias, TokenId, Vocabulary};

diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs
index 9ef545e5..8b92378e 100644
--- a/llama-rs/src/loader.rs
+++ b/llama-rs/src/loader.rs
@@ -7,6 +7,7 @@ use std::{
 };

 use crate::{
+    loader_common::FileType,
     util::{self, mulf},
     LoadError, LoadProgress, Model, TokenId, Vocabulary,
 };
@@ -69,9 +70,9 @@ pub(crate) fn load(
         n_head: read_i32(&mut reader)?.try_into()?,
         n_layer: read_i32(&mut reader)?.try_into()?,
         n_rot: read_i32(&mut reader)?.try_into()?,
-        element_type: {
+        file_type: {
             let ftype = read_i32(&mut reader)?;
-            decode_element_type(ftype).ok_or_else(|| LoadError::UnsupportedElementType(ftype))
+            FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))
         }?,
     };
@@ -108,7 +109,13 @@ pub(crate) fn load(
     // for the big tensors, we have the option to store the data in 16-bit
     // floats or quantized in order to save memory and also to speed up the
     // computation
-    let wtype = hparams.element_type;
+    let wtype = match hparams.file_type {
+        FileType::F32 => ggml::Type::F32,
+        FileType::MostlyF16 => ggml::Type::F16,
+        FileType::MostlyQ4_0 => ggml::Type::Q4_0,
+        FileType::MostlyQ4_1 => ggml::Type::Q4_1,
+        _ => unimplemented!(),
+    };

     let n_embd = hparams.n_embd;
     let n_layer = hparams.n_layer;
@@ -159,7 +166,7 @@ pub(crate) fn load(
         (None, None)
     };

-    let mut model = Model::new(context, hparams, vocabulary, n_ff, wtype, model_type, mmap);
+    let mut model = Model::new_loader1(context, hparams, vocabulary, n_ff, wtype, mmap);
     match model_type {
         ContainerType::GGMF | ContainerType::GGML => {
             let file_offset = reader.stream_position()?;
@@ -421,7 +428,7 @@ fn load_tensor_header_ggmf<'a>(
 }

 fn tensor_type_size(ftype: i32, ne: [i64; 2]) -> Option<usize> {
-    let ftype = decode_element_type(ftype)?;
+    let ftype = ggml::Type::try_from(ftype).ok()?;
     match ftype {
         ElementType::Q4_0 | ElementType::Q4_1 => {
             assert_eq!(ne[0] % 64, 0);

diff --git a/llama-rs/src/loader2.rs b/llama-rs/src/loader2.rs
index ff84e3b1..38413df0 100644
--- a/llama-rs/src/loader2.rs
+++ b/llama-rs/src/loader2.rs
@@ -3,6 +3,7 @@ use ggml_loader::*;
 use memmap2::Mmap;

 use std::{
+    collections::HashMap,
     fs::File,
     io::{BufRead, BufReader, Seek},
     ops::ControlFlow,
@@ -10,8 +11,8 @@ use std::{
 };

 use crate::{
-    util::{self, mulf},
-    Hyperparameters, LoadError, LoadProgress, Model, TokenId, Vocabulary,
+    loader_common::FileType, util, Hyperparameters, LoadError, LoadProgress, Model, TokenId,
+    Vocabulary,
 };

 impl LoadError {
@@ -38,7 +39,7 @@ pub(crate) fn load(
     path: impl AsRef<Path>,
     prefer_mmap: bool,
     n_context_tokens: usize,
-    load_progress_callback: impl FnMut(LoadProgress),
+    mut load_progress_callback: impl FnMut(LoadProgress),
 ) -> Result<Model, LoadError> {
     let main_path = path.as_ref();

@@ -47,45 +48,114 @@ pub(crate) fn load(
         return Err(LoadError::MultipartNotSupported { paths });
     }

-    let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
+    let mut file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
         source: e,
         path: main_path.to_owned(),
     })?;
     let mut reader = BufReader::new(&file);

     let path = path.as_ref().to_owned();

-    let mut loader = Loader {
-        path: path.clone(),
-        vocab: Default::default(),
-        model: None,
-        n_ctx: n_context_tokens,
-        load_progress_callback,
-        prefer_mmap,
-        tensor_accumulator: 0,
-        hyperparameters: Hyperparameters::default(),
-        container_type: ContainerType::GGJT,
-    };
+    (load_progress_callback)(LoadProgress::PartLoading {
+        file: &path,
+        current_part: 0,
+        total_parts: 1,
+    });
+
+    let mut loader = Loader::new(
+        path.clone(),
+        n_context_tokens,
+        prefer_mmap,
+        load_progress_callback,
+    );
+    let use_mmap = loader.mmap_active();

     ggml_loader::load_model_from_reader(&mut reader, &mut loader)
         .map_err(|err| LoadError::from_ggml_loader_error(err, path.clone()))?;

-    loader.model.ok_or(LoadError::ModelNotCreated { path })
+    let Loader {
+        hyperparameters,
+        vocabulary,
+        tensors,
+        mut load_progress_callback,
+        ..
+    } = loader;
+
+    let Hyperparameters { n_embd, n_mult, .. } = hyperparameters;
+    let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;
+
+    let ctx_size = tensors
+        .values()
+        .map(|ti| {
+            ggml::Tensor::C_TYPE_SIZE
+                + ggml::OBJECT_SIZE
+                + if use_mmap { 0 } else { ti.calc_size() }
+        })
+        .sum::<usize>();
+    (load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size });
+    let context = ggml::Context::init(ctx_size, !use_mmap);
+
+    let mmap = if use_mmap {
+        let file = File::open(&path)?;
+        Some(unsafe { Mmap::map(&file)? })
+    } else {
+        None
+    };
+
+    let model = Model::new_loader2(
+        context,
+        hyperparameters,
+        vocabulary,
+        n_ff,
+        path.clone(),
+        &mut file,
+        &tensors,
+        mmap,
+        |tensor_index| {
+            (load_progress_callback)(LoadProgress::PartTensorLoaded {
+                file: &path,
+                current_tensor: tensor_index,
+                tensor_count: tensors.len(),
+            });
+        },
+    )?;
+
+    (load_progress_callback)(LoadProgress::PartLoaded {
+        file: &path,
+        byte_size: 0,
+        tensor_count: tensors.len(),
+    });
+
+    Ok(model)
 }

 struct Loader<F: FnMut(LoadProgress)> {
-    // input data and options
+    // Input
     path: PathBuf,
     n_ctx: usize,
     prefer_mmap: bool,
+    load_progress_callback: F,

-    // Internal state
-    tensor_accumulator: usize,
+    // Output
     container_type: ContainerType,
     hyperparameters: Hyperparameters,
-    model: Option<Model>,
-    vocab: Vocabulary,
-    load_progress_callback: F,
+    vocabulary: Vocabulary,
+    tensors: HashMap<String, TensorInfo>,
+}
+impl<F: FnMut(LoadProgress)> Loader<F> {
+    fn new(path: PathBuf, n_ctx: usize, prefer_mmap: bool, load_progress_callback: F) -> Self {
+        Self {
+            path,
+            n_ctx,
+            prefer_mmap,
+            load_progress_callback,
+
+            container_type: ContainerType::GGJT,
+            hyperparameters: Hyperparameters::default(),
+            vocabulary: Vocabulary::default(),
+            tensors: HashMap::default(),
+        }
+    }
 }

 impl<F: FnMut(LoadProgress)> ggml_loader::LoadHandler<LoadError> for Loader<F> {
@@ -118,150 +188,33 @@
             id,
             Err(err) => return ControlFlow::Break(LoadError::InvalidIntegerConversion(err)),
         };
-        self.vocab.push_token(id, token, score);
+        self.vocabulary.push_token(id, token, score);

         ControlFlow::Continue(())
     }

     fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<LoadError, TensorDataTreatment> {
-        let model = match &mut self.model {
-            Some(model) => model,
-            None => {
-                let model = result_to_controlflow(self.create_model(self.vocab.clone()))?;
-                self.model.insert(model)
-            }
-        };
-
-        let tensor_name = match String::from_utf8(info.name) {
+        let tensor_name = match String::from_utf8(info.name.clone()) {
             Ok(n) => n,
             Err(err) => return ControlFlow::Break(LoadError::InvalidUtf8(err)),
         };

-        let tensor_count = model.tensors_mut().len();
-
-        // to satisfy borrow checker
get_tensor { - () => { - match model.tensors_mut().get_mut(&tensor_name) { - Some(tensor) => tensor, - None => { - return ControlFlow::Break(LoadError::UnknownTensor { - path: self.path.clone(), - tensor_name, - }) - } - } - }; - } - - let ret = match &model.mmap { - Some(map) => unsafe { - let ptr = map.as_ptr().offset(info.start_offset as isize); - let tensor = get_tensor!(); - tensor.set_data(ptr as *mut std::ffi::c_void); - TensorDataTreatment::SeekPast { - n_bytes: tensor.nbytes(), - } - }, - None => { - let tensor = get_tensor!(); - let buf: &mut [u8] = unsafe { - std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes()) - }; - TensorDataTreatment::CopyInto(buf) - } - }; - (self.load_progress_callback)(LoadProgress::PartTensorLoaded { - file: &self.path, - current_tensor: self.tensor_accumulator, - tensor_count, - }); - self.tensor_accumulator += 1; - - ControlFlow::Continue(ret) + self.tensors.insert(tensor_name, info); + ControlFlow::Continue(TensorDataTreatment::Skip) } } impl Loader { - fn create_model(&mut self, vocabulary: Vocabulary) -> Result { - (self.load_progress_callback)(LoadProgress::PartLoading { - file: &self.path, - current_part: 0, - total_parts: 1, - }); - let alloc = !(self.use_mmap()); - let Hyperparameters { - n_vocab, - n_embd, - n_mult, - n_layer, - element_type, - .. - } = self.hyperparameters; - let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult; - let wtype = element_type; - let ctx_size = { - // Use 64-bit math to prevent overflow. - let mut ctx_size: usize = (5 + 10 * n_layer) * 256; // object overhead - - if alloc { - let mut model_size: usize = 0; - - ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // tok_embeddings - ctx_size += mulf!(n_embd, ggml::type_sizef(ggml::Type::F32)); // norm - ctx_size += mulf!(n_embd, n_vocab, ggml::type_sizef(wtype)); // output - - model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // attention_norm - - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wq - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wk - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wv - model_size += mulf!(n_layer, n_embd, n_embd, ggml::type_sizef(wtype)); // wo - - model_size += mulf!(n_layer, n_embd, ggml::type_sizef(ggml::Type::F32)); // ffn_norm - - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w1 - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w2 - model_size += mulf!(n_layer, n_ff, n_embd, ggml::type_sizef(wtype)); // w3 - - ctx_size += model_size; - } - - (self.load_progress_callback)(LoadProgress::ContextSize { bytes: ctx_size }); - - ctx_size - }; - // Initialize the context - let context = ggml::Context::init(ctx_size, alloc); - - let mmap = if self.use_mmap() { - let file = File::open(&self.path)?; - Some(unsafe { Mmap::map(&file)? 
-        } else {
-            None
-        };
-
-        Ok(Model::new(
-            context,
-            self.hyperparameters,
-            vocabulary,
-            n_ff,
-            wtype,
-            self.container_type,
-            mmap,
-        ))
-    }
-
-    fn use_mmap(&mut self) -> bool {
+    fn mmap_active(&mut self) -> bool {
         self.prefer_mmap && self.container_type.support_mmap()
     }
 }

 /// use this to load params for llama model inside [`LoadHandler::load_hyper_parameters`]
-fn load_hyperparameters<T, R: BufRead>(
+fn load_hyperparameters<R: BufRead>(
     reader: &mut R,
     n_ctx: usize,
-) -> Result<(Hyperparameters, PartialHyperparameters), ggml_loader::LoadError<T>> {
+) -> Result<(Hyperparameters, PartialHyperparameters), ggml_loader::LoadError<LoadError>> {
     // NOTE: Field order matters! Data is laid out in the file exactly in this order.
     let hparams = Hyperparameters {
         n_vocab: read_i32(reader)?.try_into()?,
@@ -270,7 +223,12 @@ fn load_hyperparameters(
         n_head: read_i32(reader)?.try_into()?,
         n_layer: read_i32(reader)?.try_into()?,
         n_rot: read_i32(reader)?.try_into()?,
-        element_type: decode_element_type_res(read_i32(reader)?)?,
+        file_type: {
+            let ftype = read_i32(reader)?;
+            FileType::try_from(ftype).map_err(|_| {
+                ggml_loader::LoadError::UserInterrupted(LoadError::UnsupportedFileType(ftype))
+            })?
+        },
         n_ctx,
     };
     let partial = PartialHyperparameters {

diff --git a/llama-rs/src/loader_common.rs b/llama-rs/src/loader_common.rs
index 4a219642..fe44da46 100644
--- a/llama-rs/src/loader_common.rs
+++ b/llama-rs/src/loader_common.rs
@@ -1,9 +1,75 @@
-use std::path::{Path, PathBuf};
+use std::{
+    fmt::{Display, Formatter},
+    path::{Path, PathBuf},
+};

 use thiserror::Error;

 use crate::{util::FindAllModelFilesError, Hyperparameters};

+/// How the tensors are stored in the GGML LLaMA model.
+#[derive(Debug, PartialEq, Clone, Copy, Eq, Default)]
+pub enum FileType {
+    /// All tensors are stored as f32.
+    F32,
+    #[default]
+    /// All tensors are mostly stored as `f16`, except for the 1D tensors (32-bit).
+    MostlyF16,
+    /// All tensors are mostly stored as `Q4_0`, except for the 1D tensors (32-bit).
+    MostlyQ4_0,
+    /// All tensors are mostly stored as `Q4_1`, except for the 1D tensors (32-bit).
+    MostlyQ4_1,
+    /// All tensors are mostly stored as `Q4_1`, except for the 1D tensors (32-bit)
+    /// and the `tok_embeddings.weight` (f16) and `output.weight` tensors (f16).
+    MostlyQ4_1SomeF16,
+    /// All tensors are mostly stored as `Q4_2`, except for the 1D tensors (32-bit).
+    MostlyQ4_2,
+    /// All tensors are mostly stored as `Q4_3`, except for the 1D tensors (32-bit).
+    MostlyQ4_3,
+}
+impl From<FileType> for i32 {
+    fn from(value: FileType) -> Self {
+        match value {
+            FileType::F32 => 0,
+            FileType::MostlyF16 => 1,
+            FileType::MostlyQ4_0 => 2,
+            FileType::MostlyQ4_1 => 3,
+            FileType::MostlyQ4_1SomeF16 => 4,
+            FileType::MostlyQ4_2 => 5,
+            FileType::MostlyQ4_3 => 6,
+        }
+    }
+}
+impl TryFrom<i32> for FileType {
+    type Error = ();
+
+    fn try_from(value: i32) -> Result<Self, Self::Error> {
+        match value {
+            0 => Ok(FileType::F32),
+            1 => Ok(FileType::MostlyF16),
+            2 => Ok(FileType::MostlyQ4_0),
+            3 => Ok(FileType::MostlyQ4_1),
+            4 => Ok(FileType::MostlyQ4_1SomeF16),
+            5 => Ok(FileType::MostlyQ4_2),
+            6 => Ok(FileType::MostlyQ4_3),
+            _ => Err(()),
+        }
+    }
+}
+impl Display for FileType {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            FileType::F32 => write!(f, "f32"),
+            FileType::MostlyF16 => write!(f, "f16"),
+            FileType::MostlyQ4_0 => write!(f, "q4_0"),
+            FileType::MostlyQ4_1 => write!(f, "q4_1"),
+            FileType::MostlyQ4_1SomeF16 => write!(f, "q4_1_with_f16"),
+            FileType::MostlyQ4_2 => write!(f, "q4_2"),
+            FileType::MostlyQ4_3 => write!(f, "q4_3"),
+        }
+    }
+}
+
 /// Each variant represents a step within the process of loading the model.
 /// These can be used to report progress to the user.
 #[derive(Clone, PartialEq, Eq, Debug)]
@@ -79,8 +145,8 @@ pub enum LoadError {
     /// One of the integers encountered could not be converted to a more appropriate type.
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
     #[error("unsupported f16_: {0}")]
-    /// One of the integers encountered could not be converted to a more appropriate type.
-    UnsupportedElementType(i32),
+    /// The `f16_` hyperparameter had an invalid value.
+    UnsupportedFileType(i32),
     #[error("invalid magic number for {path:?}")]
     /// An invalid magic number was encountered during the loading process.
     InvalidMagic {
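A usage sketch for the new `FileType` conversions (hypothetical test; assumes `llama_rs::FileType` is in scope):

    use llama_rs::FileType;

    fn main() {
        // The i32 values mirror the `f16_` field written into model headers.
        assert_eq!(i32::from(FileType::MostlyQ4_2), 5);
        assert_eq!(FileType::try_from(3).ok(), Some(FileType::MostlyQ4_1));
        assert_eq!(FileType::MostlyF16.to_string(), "f16");
        // Unknown values surface as LoadError::UnsupportedFileType at load time.
        assert!(FileType::try_from(42).is_err());
    }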
diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 6cd64dc1..635ffaec 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -1,12 +1,18 @@
-use std::{collections::HashMap, path::Path};
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{Read, Seek, SeekFrom},
+    path::{Path, PathBuf},
+};

 use crate::{
-    loader, loader2, vocabulary::TokenId, EvaluateOutputRequest, InferenceParameters,
-    InferenceSession, InferenceSessionParameters, LoadError, LoadProgress, Vocabulary,
+    loader, loader2, loader_common::FileType, vocabulary::TokenId, EvaluateOutputRequest,
+    InferenceParameters, InferenceSession, InferenceSessionParameters, LoadError, LoadProgress,
+    Vocabulary,
 };
 use memmap2::Mmap;

-use ggml_loader::ContainerType;
+use ggml_loader::TensorInfo;

 /// The weights for the LLaMA model. All the mutable state is split into a
 /// separate struct `InferenceSession`.
@@ -25,21 +31,18 @@ pub struct Model {
     tensors: HashMap<String, ggml::Tensor>,

     /// Needs to kept alive while the model is alive
-    pub(crate) mmap: Option<Mmap>,
-
-    _version: ContainerType,
+    _mmap: Option<Mmap>,

     // Must be kept alive for the model
     _context: ggml::Context,
 }

 impl Model {
-    pub(crate) fn new(
+    pub(crate) fn new_loader1(
         context: ggml::Context,
         hparams: Hyperparameters,
         vocabulary: Vocabulary,
         n_ff: usize,
         wtype: ggml::Type,
-        container_type: ContainerType,
         mmap: Option<Mmap>,
     ) -> Model {
         let n_embd = hparams.n_embd;
@@ -110,9 +113,151 @@ impl Model {
             layers,
             tensors,
             _context: context,
-            mmap,
-            _version: container_type,
+            _mmap: mmap,
+        }
+    }
+
+    pub(crate) fn new_loader2(
+        context: ggml::Context,
+        hyperparameters: Hyperparameters,
+        vocabulary: Vocabulary,
+        n_ff: usize,
+        path: PathBuf,
+        file: &mut File,
+        tensors: &HashMap<String, TensorInfo>,
+        mmap: Option<Mmap>,
+        progress_callback: impl FnMut(usize),
+    ) -> Result<Model, LoadError> {
+        let n_embd = hyperparameters.n_embd;
+        let n_layer = hyperparameters.n_layer;
+        let n_vocab = hyperparameters.n_vocab;
+
+        struct TensorLoader<'a, F: FnMut(usize)> {
+            // Input
+            path: PathBuf,
+            file: &'a mut File,
+            tensors: &'a HashMap<String, TensorInfo>,
+            context: &'a ggml::Context,
+            mmap_ptr: Option<*const u8>,
+            progress_callback: F,
+
+            // Output
+            loaded_tensors: HashMap<String, ggml::Tensor>,
+        }
+        impl<F: FnMut(usize)> TensorLoader<'_, F> {
+            fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
+                let info = self
+                    .tensors
+                    .get(name)
+                    .ok_or_else(|| LoadError::UnknownTensor {
+                        path: self.path.clone(),
+                        tensor_name: name.to_owned(),
+                    })?;
+
+                let ctx = self.context;
+                let mut tensor = match ne.len() {
+                    1 => ctx.new_tensor_1d(info.element_type, ne[0]),
+                    2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]),
+                    3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]),
+                    _ => {
+                        return Err(LoadError::InvariantBroken {
+                            path: self.path.clone(),
+                            invariant: format!(
+                                "the tensor {name} had an unsupported dimension count: {ne:?}"
+                            ),
+                        })
+                    }
+                };
+
+                match self.mmap_ptr {
+                    Some(mmap) => unsafe {
+                        let ptr = mmap.offset(info.start_offset as isize);
+                        tensor.set_data(ptr as *mut std::ffi::c_void);
+                    },
+                    None => {
+                        let buf: &mut [u8] = unsafe {
+                            std::slice::from_raw_parts_mut(
+                                tensor.data() as *mut u8,
+                                tensor.nbytes(),
+                            )
+                        };
+                        self.file.seek(SeekFrom::Start(info.start_offset))?;
+                        self.file.read_exact(buf)?;
+                    }
+                }
+
+                self.loaded_tensors.insert(name.to_owned(), tensor.share());
+                (self.progress_callback)(self.loaded_tensors.len());
+
+                Ok(tensor)
+            }
+        }
+        let mut tl = TensorLoader {
+            path,
+            file,
+            tensors,
+            context: &context,
+            mmap_ptr: mmap.as_ref().map(|m| m.as_ptr()),
+            progress_callback,
+
+            loaded_tensors: Default::default(),
+        };
+
+        let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?;
+        let norm = tl.load("norm.weight", &[n_embd])?;
+        let output = tl.load("output.weight", &[n_embd, n_vocab])?;
+
+        let mut layers = Vec::new();
+        for i in 0..n_layer {
+            let layer = Layer {
+                attention_norm: tl.load(&format!("layers.{i}.attention_norm.weight"), &[n_embd])?,
+                wq: tl.load(
+                    &format!("layers.{i}.attention.wq.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                wk: tl.load(
+                    &format!("layers.{i}.attention.wk.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                wv: tl.load(
+                    &format!("layers.{i}.attention.wv.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                wo: tl.load(
+                    &format!("layers.{i}.attention.wo.weight"),
+                    &[n_embd, n_embd],
+                )?,
+                ffn_norm: tl.load(&format!("layers.{i}.ffn_norm.weight"), &[n_embd])?,
+                w1: tl.load(
+                    &format!("layers.{i}.feed_forward.w1.weight"),
+                    &[n_embd, n_ff],
+                )?,
+                w2: tl.load(
+                    &format!("layers.{i}.feed_forward.w2.weight"),
+                    &[n_ff, n_embd],
+                )?,
+                w3: tl.load(
+                    &format!("layers.{i}.feed_forward.w3.weight"),
+                    &[n_embd, n_ff],
+                )?,
+            };
+
+            layers.push(layer);
         }
+
+        let tensors = tl.loaded_tensors;
+
+        Ok(Model {
+            hparams: hyperparameters,
+            vocabulary,
+            tok_embeddings,
+            norm,
+            output,
+            layers,
+            tensors,
+            _context: context,
+            _mmap: mmap,
+        })
     }

     /// Load the model from `path` with `n_context_tokens` context tokens.
@@ -180,7 +325,7 @@ impl Model {
             n_head,
             n_layer,
             n_rot,
-            element_type: _,
+            file_type: _,
         } = self.hparams;

         // For the first run, we need to guess a maximum buffer size so we can measure
@@ -472,8 +617,8 @@ pub struct Hyperparameters {
     pub n_layer: usize,
     /// n_rot
     pub n_rot: usize,
-    /// element_type
-    pub element_type: crate::ElementType,
+    /// file_type
+    pub file_type: FileType,
 }

 struct Layer {

From 5e5f3ccbef646b7f9170301d564e3f2310512e61 Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 24 Apr 2023 02:58:50 +0200
Subject: [PATCH 2/4] chore: ignore too many arguments

---
 llama-rs/src/model.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 635ffaec..4b158f14 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -117,6 +117,7 @@ impl Model {
         }
     }

+    #[allow(clippy::too_many_arguments)]
     pub(crate) fn new_loader2(
         context: ggml::Context,
         hyperparameters: Hyperparameters,

From ecb9175ed38bfc2d962a2787f45aae914f57b4c4 Mon Sep 17 00:00:00 2001
From: Philpax
Date: Mon, 24 Apr 2023 03:09:33 +0200
Subject: [PATCH 3/4] chore: hide Model internals

---
 llama-rs/src/inference_session.rs |  4 ++--
 llama-rs/src/model.rs             | 20 ++++++++++++--------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/llama-rs/src/inference_session.rs b/llama-rs/src/inference_session.rs
index 428b9a7b..e4e9c244 100644
--- a/llama-rs/src/inference_session.rs
+++ b/llama-rs/src/inference_session.rs
@@ -68,7 +68,7 @@ impl InferenceSession {
             .map(|(_, tok)| *tok)
             .collect();

-        if self.n_past + prompt_tokens.len() >= model.hparams.n_ctx {
+        if self.n_past + prompt_tokens.len() >= model.n_ctx() {
             return Err(InferenceError::ContextFull);
         }

@@ -96,7 +96,7 @@ impl InferenceSession {
         params: &InferenceParameters,
         rng: &mut impl rand::Rng,
     ) -> Result<&'v [u8], InferenceError> {
-        if self.n_past + 1 >= model.hparams.n_ctx {
+        if self.n_past + 1 >= model.n_ctx() {
             return Err(InferenceError::ContextFull);
         }

diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 4b158f14..6be17b99 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -17,7 +17,7 @@ use ggml_loader::TensorInfo;

 /// The weights for the LLaMA model. All the mutable state is split into a
 /// separate struct `InferenceSession`.
 pub struct Model {
-    pub(crate) hparams: Hyperparameters,
+    hyperparameters: Hyperparameters,

     vocabulary: Vocabulary,

@@ -105,7 +105,7 @@ impl Model {
         }

         Model {
-            hparams,
+            hyperparameters: hparams,
             vocabulary,
             tok_embeddings,
             norm,
@@ -249,7 +249,7 @@ impl Model {
         let tensors = tl.loaded_tensors;

         Ok(Model {
-            hparams: hyperparameters,
+            hyperparameters,
             vocabulary,
             tok_embeddings,
             norm,
@@ -291,10 +291,10 @@ impl Model {
     pub fn start_session(&self, params: InferenceSessionParameters) -> InferenceSession {
         InferenceSession::new(
             params,
-            self.hparams.n_ctx,
-            self.hparams.n_layer,
-            self.hparams.n_embd,
-            self.hparams.n_vocab,
+            self.hyperparameters.n_ctx,
+            self.hyperparameters.n_layer,
+            self.hyperparameters.n_embd,
+            self.hyperparameters.n_vocab,
         )
     }

@@ -327,7 +327,7 @@ impl Model {
             n_layer,
             n_rot,
             file_type: _,
-        } = self.hparams;
+        } = self.hyperparameters;

         // For the first run, we need to guess a maximum buffer size so we can measure
         // the actual memory consumption of the temporary ggml context.
@@ -599,6 +599,10 @@ impl Model {
     pub(crate) fn tensors_mut(&mut self) -> &mut HashMap<String, ggml::Tensor> {
         &mut self.tensors
     }
+
+    pub(crate) fn n_ctx(&self) -> usize {
+        self.hyperparameters.n_ctx
+    }
 }

 /// The hyperparameters of the model.

From c9e5c2659fd9aa3bdde685c36096b968eaa0d97a Mon Sep 17 00:00:00 2001
From: Philpax
Date: Tue, 25 Apr 2023 03:33:08 +0200
Subject: [PATCH 4/4] refactor: decouple loading from model

---
 llama-rs/src/loader2.rs |  98 ++++++++++++++++++++++++++++++--------
 llama-rs/src/model.rs   | 103 +++++----------------------------------
 2 files changed, 91 insertions(+), 110 deletions(-)

diff --git a/llama-rs/src/loader2.rs b/llama-rs/src/loader2.rs
index 38413df0..ead8bfb3 100644
--- a/llama-rs/src/loader2.rs
+++ b/llama-rs/src/loader2.rs
@@ -5,14 +5,14 @@ use memmap2::Mmap;
 use std::{
     collections::HashMap,
     fs::File,
-    io::{BufRead, BufReader, Seek},
+    io::{BufRead, BufReader, Read, Seek},
     ops::ControlFlow,
     path::{Path, PathBuf},
 };

 use crate::{
-    loader_common::FileType, util, Hyperparameters, LoadError, LoadProgress, Model, TokenId,
-    Vocabulary,
+    loader_common::FileType, model::TensorLoader, util, Hyperparameters, LoadError, LoadProgress,
+    Model, TokenId, Vocabulary,
 };

 impl LoadError {
@@ -48,7 +48,7 @@ pub(crate) fn load(
         return Err(LoadError::MultipartNotSupported { paths });
     }

-    let mut file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
+    let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
         source: e,
         path: main_path.to_owned(),
     })?;
@@ -102,28 +102,86 @@ pub(crate) fn load(
         None
     };

+    struct TensorLoader2<'a> {
+        path: PathBuf,
+        file: File,
+        tensors: HashMap<String, TensorInfo>,
+        context: ggml::Context,
+        mmap: Option<Mmap>,
+        load_progress_callback: &'a mut dyn FnMut(LoadProgress),
+        loaded_tensors: HashMap<String, ggml::Tensor>,
+    }
+    impl TensorLoader<LoadError> for TensorLoader2<'_> {
+        fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
+            let info = self
+                .tensors
+                .get(name)
+                .ok_or_else(|| LoadError::UnknownTensor {
+                    path: self.path.clone(),
+                    tensor_name: name.to_owned(),
+                })?;
+
+            let ctx = &self.context;
+            let mut tensor = match ne.len() {
+                1 => ctx.new_tensor_1d(info.element_type, ne[0]),
+                2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]),
+                3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]),
+                _ => {
+                    return Err(LoadError::InvariantBroken {
+                        path: self.path.clone(),
+                        invariant: format!(
+                            "the tensor {name} had an unsupported dimension count: {ne:?}"
+                        ),
+                    })
+                }
+            };
+
+            match self.mmap.as_ref() {
+                Some(mmap) => unsafe {
+                    let ptr = mmap.as_ptr().offset(info.start_offset as isize);
+                    tensor.set_data(ptr as *mut std::ffi::c_void);
+                },
+                None => {
+                    let buf: &mut [u8] = unsafe {
+                        std::slice::from_raw_parts_mut(tensor.data() as *mut u8, tensor.nbytes())
+                    };
+                    self.file.seek(SeekFrom::Start(info.start_offset))?;
+                    self.file.read_exact(buf)?;
+                }
+            }
+
+            self.loaded_tensors.insert(name.to_owned(), tensor.share());
+            (self.load_progress_callback)(LoadProgress::PartTensorLoaded {
+                file: &self.path,
+                current_tensor: self.loaded_tensors.len(),
+                tensor_count: self.tensors.len(),
+            });
+
+            Ok(tensor)
+        }
+
+        fn finish(self) -> (ggml::Context, HashMap<String, ggml::Tensor>, Option<Mmap>) {
+            (self.context, self.loaded_tensors, self.mmap)
+        }
+    }
+
+    let tensors_len = tensors.len();
+    let tl = TensorLoader2 {
+        path: path.clone(),
+        file,
+        tensors,
         context,
-        hyperparameters,
-        vocabulary,
-        n_ff,
-        path.clone(),
-        &mut file,
-        &tensors,
         mmap,
-        |tensor_index| {
-            (load_progress_callback)(LoadProgress::PartTensorLoaded {
-                file: &path,
-                current_tensor: tensor_index,
-                tensor_count: tensors.len(),
-            });
-        },
-    )?;
+        load_progress_callback: &mut load_progress_callback,
+        loaded_tensors: Default::default(),
+    };
+
+    let model = Model::new_loader2(hyperparameters, vocabulary, n_ff, tl)?;

     (load_progress_callback)(LoadProgress::PartLoaded {
         file: &path,
         byte_size: 0,
-        tensor_count: tensors.len(),
+        tensor_count: tensors_len,
     });

     Ok(model)
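With loading now behind the `model::TensorLoader` trait, `Model::new_loader2` no longer cares where tensor data comes from. As an illustration of the decoupling, a hypothetical second implementor (names assumed; types as imported at the top of loader2.rs above) that allocates tensors but never touches a file, e.g. for tests:

    // Sketch: a TensorLoader that leaves every tensor zero-initialized.
    struct ZeroTensorLoader {
        context: ggml::Context,
        infos: HashMap<String, TensorInfo>,
        loaded: HashMap<String, ggml::Tensor>,
    }
    impl TensorLoader<LoadError> for ZeroTensorLoader {
        fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
            let info = self
                .infos
                .get(name)
                .ok_or_else(|| LoadError::UnknownTensor {
                    path: PathBuf::new(),
                    tensor_name: name.to_owned(),
                })?;
            let tensor = match ne.len() {
                1 => self.context.new_tensor_1d(info.element_type, ne[0]),
                2 => self.context.new_tensor_2d(info.element_type, ne[0], ne[1]),
                _ => unimplemented!("LLaMA's graph only needs 1D/2D weights here"),
            };
            self.loaded.insert(name.to_owned(), tensor.share());
            Ok(tensor)
        }
        fn finish(self) -> (ggml::Context, HashMap<String, ggml::Tensor>, Option<Mmap>) {
            (self.context, self.loaded, None)
        }
    }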
diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index 6be17b99..13488a2d 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -1,9 +1,4 @@
-use std::{
-    collections::HashMap,
-    fs::File,
-    io::{Read, Seek, SeekFrom},
-    path::{Path, PathBuf},
-};
+use std::{collections::HashMap, error::Error, path::Path};

 use crate::{
     loader, loader2, loader_common::FileType, vocabulary::TokenId, EvaluateOutputRequest,
     InferenceParameters, InferenceSession, InferenceSessionParameters, LoadError, LoadProgress,
     Vocabulary,
 };
 use memmap2::Mmap;

-use ggml_loader::TensorInfo;
-
 /// The weights for the LLaMA model. All the mutable state is split into a
 /// separate struct `InferenceSession`.
 pub struct Model {
@@ -117,92 +110,17 @@ impl Model {
         }
     }

-    #[allow(clippy::too_many_arguments)]
-    pub(crate) fn new_loader2(
-        context: ggml::Context,
+    pub(crate) fn new_loader2(
         hyperparameters: Hyperparameters,
         vocabulary: Vocabulary,
         n_ff: usize,
-        path: PathBuf,
-        file: &mut File,
-        tensors: &HashMap<String, TensorInfo>,
-        mmap: Option<Mmap>,
-        progress_callback: impl FnMut(usize),
-    ) -> Result<Model, LoadError> {
+        tensor_loader: impl TensorLoader<LoadError>,
+    ) -> Result<Model, LoadError> {
         let n_embd = hyperparameters.n_embd;
         let n_layer = hyperparameters.n_layer;
         let n_vocab = hyperparameters.n_vocab;

-        struct TensorLoader<'a, F: FnMut(usize)> {
-            // Input
-            path: PathBuf,
-            file: &'a mut File,
-            tensors: &'a HashMap<String, TensorInfo>,
-            context: &'a ggml::Context,
-            mmap_ptr: Option<*const u8>,
-            progress_callback: F,
-
-            // Output
-            loaded_tensors: HashMap<String, ggml::Tensor>,
-        }
-        impl<F: FnMut(usize)> TensorLoader<'_, F> {
-            fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, LoadError> {
-                let info = self
-                    .tensors
-                    .get(name)
-                    .ok_or_else(|| LoadError::UnknownTensor {
-                        path: self.path.clone(),
-                        tensor_name: name.to_owned(),
-                    })?;
-
-                let ctx = self.context;
-                let mut tensor = match ne.len() {
-                    1 => ctx.new_tensor_1d(info.element_type, ne[0]),
-                    2 => ctx.new_tensor_2d(info.element_type, ne[0], ne[1]),
-                    3 => ctx.new_tensor_3d(info.element_type, ne[0], ne[1], ne[2]),
-                    _ => {
-                        return Err(LoadError::InvariantBroken {
-                            path: self.path.clone(),
-                            invariant: format!(
-                                "the tensor {name} had an unsupported dimension count: {ne:?}"
-                            ),
-                        })
-                    }
-                };
-
-                match self.mmap_ptr {
-                    Some(mmap) => unsafe {
-                        let ptr = mmap.offset(info.start_offset as isize);
-                        tensor.set_data(ptr as *mut std::ffi::c_void);
-                    },
-                    None => {
-                        let buf: &mut [u8] = unsafe {
-                            std::slice::from_raw_parts_mut(
-                                tensor.data() as *mut u8,
-                                tensor.nbytes(),
-                            )
-                        };
-                        self.file.seek(SeekFrom::Start(info.start_offset))?;
-                        self.file.read_exact(buf)?;
-                    }
-                }
-
-                self.loaded_tensors.insert(name.to_owned(), tensor.share());
-                (self.progress_callback)(self.loaded_tensors.len());
-
-                Ok(tensor)
-            }
-        }
-        let mut tl = TensorLoader {
-            path,
-            file,
-            tensors,
-            context: &context,
-            mmap_ptr: mmap.as_ref().map(|m| m.as_ptr()),
-            progress_callback,
-
-            loaded_tensors: Default::default(),
-        };
+        let mut tl = tensor_loader;

         let tok_embeddings = tl.load("tok_embeddings.weight", &[n_embd, n_vocab])?;
         let norm = tl.load("norm.weight", &[n_embd])?;
@@ -246,7 +164,7 @@ impl Model {
             layers.push(layer);
         }

-        let tensors = tl.loaded_tensors;
+        let (_context, tensors, _mmap) = tl.finish();

         Ok(Model {
             hyperparameters,
@@ -256,8 +174,8 @@ impl Model {
             output,
             layers,
             tensors,
-            _context: context,
-            _mmap: mmap,
+            _context,
+            _mmap,
         })
     }

@@ -626,6 +544,11 @@ pub struct Hyperparameters {
     pub file_type: FileType,
 }

+pub(crate) trait TensorLoader<E: Error> {
+    fn load(&mut self, name: &str, ne: &[usize]) -> Result<ggml::Tensor, E>;
+    fn finish(self) -> (ggml::Context, HashMap<String, ggml::Tensor>, Option<Mmap>);
+}
+
 struct Layer {
     attention_norm: ggml::Tensor,