From decff8b53afefd239d51f20a790419c66559de5c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 3 Mar 2026 18:15:49 -0600 Subject: [PATCH 01/23] quantize : imatrix-fail early + code cleanup --- src/llama-quant.cpp | 712 ++++++++++++++++++++++-------------- src/llama-quant.h | 24 ++ tools/quantize/quantize.cpp | 19 +- 3 files changed, 460 insertions(+), 295 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 24770430e1c..58ed0e9db7a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1,11 +1,12 @@ +#include "llama.h" #include "llama-quant.h" #include "llama-impl.h" #include "llama-model.h" #include "llama-model-loader.h" -#include #include #include +#include #include #include #include @@ -13,12 +14,6 @@ #include #include -// Quantization types. Changes to this struct must be replicated in quantize.cpp -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -54,7 +49,7 @@ static std::string remap_layer(const std::string & orig_name, const std::vector< return orig_name; } -static std::string remap_imatrix (const std::string & orig_name, const std::map & mapped) { +static std::string remap_imatrix(const std::string & orig_name, const std::map & mapped) { if (mapped.empty()) { return orig_name; } @@ -76,6 +71,73 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< return orig_name; } +// +// helper functions for tensor name matching +// + +static bool tensor_name_match_token_embd(const char * tensor_name) { + return std::strcmp(tensor_name, "token_embd.weight") == 0 || + std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0; +} + +static bool tensor_name_match_output_weight(const char * tensor_name) { + return std::strcmp(tensor_name, "output.weight") == 0; +} + +// +// tensor categorization for quantization +// +// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch) +// + +static tensor_category tensor_get_category(const std::string & tensor_name) { + if (tensor_name_match_output_weight(tensor_name.c_str())) { + return tensor_category::OUTPUT; + } + if (tensor_name_match_token_embd(tensor_name.c_str())) { + return tensor_category::TOKEN_EMBD; + } + if (tensor_name.find("attn_qkv.weight") != std::string::npos) { + return tensor_category::ATTENTION_QKV; + } + if (tensor_name.find("attn_kv_b.weight") != std::string::npos) { + return tensor_category::ATTENTION_KV_B; + } + if (tensor_name.find("attn_v.weight") != std::string::npos) { + return tensor_category::ATTENTION_V; + } + if (tensor_name.find("attn_k.weight") != std::string::npos) { + return tensor_category::ATTENTION_K; + } + if (tensor_name.find("attn_q.weight") != std::string::npos) { + return tensor_category::ATTENTION_Q; + } + if (tensor_name.find("attn_output.weight") != std::string::npos) { + return tensor_category::ATTENTION_OUTPUT; + } + if (tensor_name.find("ffn_up") != std::string::npos) { + return tensor_category::FFN_UP; + } + if (tensor_name.find("ffn_gate") != std::string::npos) { + return tensor_category::FFN_GATE; + } + if (tensor_name.find("ffn_down") != std::string::npos) { + return tensor_category::FFN_DOWN; + } + return tensor_category::OTHER; +} + +// check if category is for attention-v-like tensors (more sensitive to quantization) +static bool category_is_attn_v(tensor_category cat) { + return cat == tensor_category::ATTENTION_V || + cat == tensor_category::ATTENTION_QKV 
|| + cat == tensor_category::ATTENTION_KV_B; +} + +// +// quantization state +// + struct quantize_state_impl { const llama_model & model; const llama_model_quantize_params * params; @@ -89,20 +151,43 @@ struct quantize_state_impl { int i_ffn_gate = 0; int i_ffn_up = 0; - int n_k_quantized = 0; int n_fallback = 0; bool has_imatrix = false; - // used to figure out if a model shares tok_embd with the output weight - bool has_output = false; + // used to figure out if a model has tied embeddings (tok_embd shares weights with output) + bool has_tied_embeddings = false; // assume tied until we see output.weight + + // tensor type override patterns (compiled once, used twice) + std::vector> tensor_type_patterns; quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) - {} + { + // compile regex patterns once - they are expensive + if (params->tensor_types) { + const auto & tensor_types = *static_cast *>(params->tensor_types); + for (const auto & [tname, qtype] : tensor_types) { + tensor_type_patterns.emplace_back(std::regex(tname), qtype); + } + } + } }; +// per-tensor metadata, computed in the preliminary loop and used in the main loop +struct tensor_metadata { + ggml_type target_type; + tensor_category category; + std::string remapped_imatrix_name; + bool allows_quantization; + bool requires_imatrix; +}; + +// +// dequantization +// + static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -175,12 +260,132 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { +// +// do we allow this tensor to be quantized? +// + +static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) { + // trivial checks first -- no string ops needed + if (params->only_copy) return false; + + // quantize only 2D and 3D tensors (experts) + if (ggml_n_dims(tensor) < 2) return false; + + const std::string name = ggml_get_name(tensor); + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // do not quantize norm tensors + quantize &= name.find("_norm.weight") == std::string::npos; + + quantize &= params->quantize_output_tensor || name != "output.weight"; + + // do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // these are very small (e.g. 
4x4) + quantize &= name.find("altup") == std::string::npos; + quantize &= name.find("laurel") == std::string::npos; + + // these are not too big so keep them as it is + quantize &= name.find("per_layer_model_proj") == std::string::npos; + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // do not quantize Mamba/Kimi's small conv1d weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ssm_conv1d") == std::string::npos; + quantize &= name.find("shortconv.conv.weight") == std::string::npos; + + // do not quantize RWKV's small yet 2D weights + quantize &= name.find("time_mix_first.weight") == std::string::npos; + quantize &= name.find("time_mix_w0.weight") == std::string::npos; + quantize &= name.find("time_mix_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_v0.weight") == std::string::npos; + quantize &= name.find("time_mix_v1.weight") == std::string::npos; + quantize &= name.find("time_mix_v2.weight") == std::string::npos; + quantize &= name.find("time_mix_a0.weight") == std::string::npos; + quantize &= name.find("time_mix_a1.weight") == std::string::npos; + quantize &= name.find("time_mix_a2.weight") == std::string::npos; + quantize &= name.find("time_mix_g1.weight") == std::string::npos; + quantize &= name.find("time_mix_g2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // do not quantize relative position bias (T5) + quantize &= name.find("attn_rel_b.weight") == std::string::npos; + + // do not quantize specific multimodal tensors + quantize &= name.find(".position_embd.") == std::string::npos; + + return quantize; +} + +// +// tensor type selection +// + +// incompatible tensor shapes are handled here - fallback to a compatible type +static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) { + ggml_type return_type = target_type; + + const int64_t ncols = t->ne[0]; + const int64_t qk_k = ggml_blck_size(target_type); + + if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant + LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ", + t->name, ncols, qk_k, ggml_type_name(target_type)); + ++qs.n_fallback; + + switch (target_type) { + // types on the left: block size 256 + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: // types on the right: block size 32 + case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break; + case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break; + default: + throw std::runtime_error(format("no tensor type fallback is defined for type %s", + ggml_type_name(target_type))); + } + if (ncols % ggml_blck_size(return_type) != 0) { + // + // the fallback return type 
is still not compatible for this tensor! + // + // most likely, this tensor's first dimension is not divisible by 32. + // this is very rare. we can either abort the quantization, or + // fallback to F16 / F32. + // + LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) "); + return_type = GGML_TYPE_F16; + } + LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type)); + } + return return_type; +} + +// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch +static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants const llm_arch arch = qs.model.arch; - const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int n_layers) -> bool { return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; @@ -204,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor - if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { + if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) { if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { new_type = qs.params->output_tensor_type; } else { @@ -234,7 +439,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else { new_type = GGML_TYPE_Q8_0; } - } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") { + } else if (category == tensor_category::TOKEN_EMBD) { if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { @@ -254,21 +459,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (name.find("attn_v.weight") != std::string::npos) { + if (category_is_attn_v(category)) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) { new_type = GGML_TYPE_Q4_K; } - else if (name.find("ffn_down") != std::string::npos) { + else if (category == tensor_category::FFN_DOWN) { if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_down; } - else if (name.find("attn_output.weight") != std::string::npos) { + else if (category == tensor_category::ATTENTION_OUTPUT) { if (qs.model.hparams.n_expert == 8) { new_type = GGML_TYPE_Q5_K; } else { @@ -276,7 +481,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; } } - } else if (name.find("attn_v.weight") != std::string::npos) { + } else if (category_is_attn_v(category)) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -314,7 +519,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q8_0; } ++qs.i_attention_wv; - } else if (name.find("attn_k.weight") != std::string::npos) { + } else if (category == tensor_category::ATTENTION_K) { if (qs.model.hparams.n_expert == 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies @@ -326,14 +531,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } - } else if (name.find("attn_q.weight") != std::string::npos) { + } else if (category == tensor_category::ATTENTION_Q) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } - } else if (name.find("ffn_down") != std::string::npos) { + } else if (category == tensor_category::FFN_DOWN) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; @@ -378,7 +583,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } ++qs.i_ffn_down; - } else if (name.find("attn_output.weight") != std::string::npos) { + } else if (category == tensor_category::ATTENTION_OUTPUT) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -398,14 +603,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } } - else if (name.find("attn_qkv.weight") != std::string::npos) { + else if (category == tensor_category::ATTENTION_QKV) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } - else if (name.find("ffn_gate") != std::string::npos) { + else if (category == tensor_category::FFN_GATE) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { @@ -413,7 +618,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_ffn_gate; } - else if (name.find("ffn_up") != std::string::npos) { + else if (category == tensor_category::FFN_UP) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { @@ -425,6 +630,54 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return new_type; } +// outer wrapper: determine the ggml_type that this tensor should be quantized to +static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) { + if (!tensor_allows_quantization(params, qs.model.arch, tensor)) { + return tensor->type; + } + if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) { + return params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) { + return params->output_tensor_type; + } + + ggml_type new_type = default_type; + + // get more optimal quantization type based on the tensor shape, layer, etc. 
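+    // for example (illustrative only): a pattern supplied through the --tensor-type option,
+    // e.g. something like "ffn_down=q6_k", ends up in qs.tensor_type_patterns and takes
+    // precedence over the ftype-based heuristics applied below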
+ if (!params->pure && ggml_is_quantized(default_type)) { + // if the user provided tensor types - use those + bool manual = false; + if (!qs.tensor_type_patterns.empty()) { + const std::string tensor_name(tensor->name); + for (const auto & [pattern, qtype] : qs.tensor_type_patterns) { + if (std::regex_search(tensor_name, pattern)) { + if (qtype != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; + manual = true; + break; + } + } + } + } + + // if not manual - use the standard logic for choosing the quantization type based on the selected mixture + if (!manual) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category); + } + + // incompatible tensor shapes are handled here - fallback to a compatible type + new_type = tensor_type_fallback(qs, tensor, new_type); + } + + return new_type; +} + +// +// quantization implementation +// + static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -479,61 +732,85 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) { - return ( - dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || - dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || - dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings - dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0 - ) - ); +// +// imatrix requirement check +// + +static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) { + if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) { + return false; + } + switch (dst_type) { + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ1_S: + return true; + case GGML_TYPE_Q2_K: + // as a general rule, the k-type quantizations don't require imatrix data. + // the only exception is Q2_K tensors that are part of a Q2_K_S file. 
+ return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S; + default: + return false; + } } -static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { - ggml_type default_type; - llama_ftype ftype = params->ftype; +// +// given a file type, get the default tensor type +// - switch (params->ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; - case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; - case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; - case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; - case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; - case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break; - case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; +static ggml_type llama_ftype_get_default_type(llama_ftype ftype) { + switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0; + case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1; + case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0; + case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1; + case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0; + case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16; + case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16; + case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32; - case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break; + case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4; // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: - case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K; + case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K; case LLAMA_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K; case LLAMA_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break; - case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; - case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break; - case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; - case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; - case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; - case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K; + case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K; + case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0; + case 
LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS; + case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS; + case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS; + case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S; + case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL; + case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS; + case LLAMA_FTYPE_MOSTLY_IQ3_S: + case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } +} + +// +// main quantization driver +// + +static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { + ggml_type default_type; + llama_ftype ftype = params->ftype; int nthread = params->nthread; @@ -541,6 +818,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: nthread = std::thread::hardware_concurrency(); } + default_type = llama_ftype_get_default_type(ftype); + // mmap consistently increases speed on Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. #if defined(__linux__) || defined(_WIN32) @@ -567,6 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize_state_impl qs(model, params); + // these need to be set to n_layer by default + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; + if (params->only_copy) { ftype = ml.ftype; } @@ -574,7 +856,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->imatrix) { imatrix_data = static_cast>*>(params->imatrix); if (imatrix_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n", + __func__, (int)imatrix_data->size()); qs.has_imatrix = true; // check imatrix for nans or infs for (const auto & kv : *imatrix_data) { @@ -657,35 +940,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; - - const std::string name = ggml_get_name(tensor); - - // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos || - name.find("attn_kv_b.weight")!= std::string::npos) { - ++qs.n_attention_wv; - } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { - qs.has_output = true; - } - } - - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector workers; - workers.reserve(nthread); - int idx = 0; - - std::vector> read_data; - std::vector> work; - std::vector> f32_conv_buf; - uint16_t n_split = 1; // Assume split index is continuous @@ -697,14 +952,62 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { + // compute tensor metadata once and cache it + 
std::vector metadata(tensors.size()); + + // flag for --dry-run + bool will_require_imatrix = false; + + // + // preliminary iteration over all weights + // + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto * it = tensors[i]; + const struct ggml_tensor * tensor = it->tensor; + const std::string name = ggml_get_name(tensor); + + metadata[i].category = tensor_get_category(name); + + if (category_is_attn_v(metadata[i].category)) { + ++qs.n_attention_wv; + } + + if (tensor_name_match_output_weight(name.c_str())) { + qs.has_tied_embeddings = false; + } + uint16_t i_split = params->keep_split ? it->idx : 0; - ggml_tensor * tensor = it->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } gguf_add_tensor(ctx_outs[i_split].get(), tensor); + + metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor); + + if (metadata[i].allows_quantization) { + metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]); + } else { + metadata[i].target_type = tensor->type; + } + + metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype); + + if (params->imatrix) { + metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped); + } else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) { + if (params->dry_run) { + will_require_imatrix = true; + } else { + LLAMA_LOG_ERROR("\n============================================================================\n" + " ERROR: this quantization requires an importance matrix!\n" + " - offending tensor: %s\n" + " - target type: %s\n" + "============================================================================\n\n", + name.c_str(), ggml_type_name(metadata[i].target_type)); + throw std::runtime_error("this quantization requires an imatrix!"); + } + } } // Set split info if needed @@ -716,6 +1019,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector workers; + workers.reserve(nthread); + + std::vector> read_data; + std::vector> work; + std::vector> f32_conv_buf; + int cur_split = -1; std::ofstream fout; auto close_ofstream = [&]() { @@ -745,20 +1058,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ::zeros(fout, meta_size); }; - const auto tn = LLM_TN(model.arch); - // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); } - // flag for `--dry-run`, to let the user know if imatrix will be required for a real - // quantization, as a courtesy - bool will_require_imatrix = false; + // + // main loop: iterate over all weights + // - for (const auto * it : tensors) { - const auto & weight = *it; + for (size_t i = 0; i < tensors.size(); ++i) { + const auto & weight = *tensors[i]; + const auto & tm = metadata[i]; ggml_tensor * tensor = weight.tensor; + if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) { close_ofstream(); new_ofstream(weight.idx); @@ -783,156 +1096,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? 
- - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; - - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba /Kimi's small conv1d weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; - - // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; - - ggml_type new_type; - void * new_data; - size_t new_size; + const ggml_type cur_type = tensor->type; + const ggml_type new_type = tm.target_type; - if (quantize) { - new_type = default_type; - - // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { - // if the user provided tensor types - use those - bool manual = false; - if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); - const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins - manual = true; - break; - } - } - } - } + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + bool quantize = cur_type != new_type; - // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - - // incompatible tensor shapes are handled here - fallback to a compatible type - { - bool convert_incompatible_tensor = false; - - const int64_t nx = tensor->ne[0]; - const int64_t ny = tensor->ne[1]; - const int64_t qk_k = ggml_blck_size(new_type); - - if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); - convert_incompatible_tensor = true; - } else { - ++qs.n_k_quantized; - } - - if (convert_incompatible_tensor) { - switch (new_type) { - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; - case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; - case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; - case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); - } - if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { - new_type = GGML_TYPE_F16; - } - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } - } - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } - - // If we've decided to quantize to the same type the tensor is already - // in then there's nothing to do. 
- quantize = tensor->type != new_type; - } + void * new_data; + size_t new_size; - // we have now decided on the target type for this tensor if (params->dry_run) { - // the --dry-run option calculates the final quantization size without quantizting + // the --dry-run option calculates the final quantization size without quantizing if (quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) { + if (!will_require_imatrix && tm.requires_imatrix) { will_require_imatrix = true; } } else { @@ -945,7 +1127,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { // no --dry-run, perform quantization if (!quantize) { - new_type = tensor->type; new_data = tensor->data; new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0); @@ -954,7 +1135,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const float * imatrix = nullptr; if (imatrix_data) { - auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + auto it = imatrix_data->find(tm.remapped_imatrix_name); if (it == imatrix_data->end()) { LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { @@ -968,14 +1149,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // this is a significant error and it may be good idea to abort the process if this happens, // since many people will miss the error and not realize that most of the model is being quantized without an imatrix // tok_embd should be ignored in this case, since it always causes this warning - if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + if (!tensor_name_match_token_embd(tensor->name)) { throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); } } } } - if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) { + if (!imatrix && tm.requires_imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); @@ -1020,29 +1201,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); - - // TODO: temporary sanity check that the F16 -> MXFP4 is lossless -#if 0 - if (new_type == GGML_TYPE_MXFP4) { - auto * x = f32_data_03; - - //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); - std::vector deq(nrows*n_per_row); - const ggml_type_traits * qtype = ggml_get_type_traits(new_type); - qtype->to_float(new_data_03, deq.data(), deq.size()); - - double err = 0.0f; - for (int i = 0; i < (int) deq.size(); ++i) { - err += fabsf(deq[i] - x[i]); - //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { - if (deq[i] != x[i]) { - LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); - } - } - //LLAMA_LOG_INFO("err = %f\n", err); - GGML_ASSERT(err == 0.00000); - } -#endif } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } @@ -1058,7 +1216,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } // no --dry-run - } // iterate over tensors + } // main loop if (!params->dry_run) { close_ofstream(); @@ -1075,7 +1233,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", - __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + __func__, qs.n_fallback, ml.n_tensors); } } diff --git a/src/llama-quant.h b/src/llama-quant.h index 6f70f09beec..a91ebffa37e 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1 +1,25 @@ #pragma once +#include + +// Quantization types, used in both quantize.cpp and llama-quant.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + +// tensor categorization - used to avoid repeated string matching in quantization logic. +// this is different from LLM_TN - we want broad categories, not specific tensor names per arch. +enum class tensor_category { + TOKEN_EMBD, + ATTENTION_Q, + ATTENTION_V, + ATTENTION_K, + ATTENTION_QKV, + ATTENTION_KV_B, + ATTENTION_OUTPUT, + FFN_UP, + FFN_GATE, + FFN_DOWN, + OUTPUT, + OTHER +}; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 59bf9bd3fd0..9a7e2fc1c7b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -1,6 +1,7 @@ #include "common.h" #include "llama.h" #include "gguf.h" +#include "../src/llama-quant.h" #include #include @@ -61,12 +62,6 @@ static const std::vector QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; -// Quantization types. 
Changes to this struct must be replicated in llama-quantize.cpp -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; - static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; @@ -686,18 +681,6 @@ int main(int argc, char ** argv) { } } - if (!params.dry_run && - ( - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M - ) && imatrix_data.empty()) { - fprintf(stderr, "\n==========================================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "==========================================================================================================\n\n\n"); - return 1; - } - if (!params.dry_run) { if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); From d84833f69a57033736072c196b83913025d7af8a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 4 Mar 2026 23:25:12 -0600 Subject: [PATCH 02/23] WIP --- src/llama-quant-scheduler.cpp | 116 ++++++++++++++++++++++++++++++++++ src/llama-quant.cpp | 6 +- src/llama-quant.h | 21 ++++-- tools/quantize/quantize.cpp | 16 ++--- 4 files changed, 144 insertions(+), 15 deletions(-) create mode 100644 src/llama-quant-scheduler.cpp diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp new file mode 100644 index 00000000000..a0cf5116567 --- /dev/null +++ b/src/llama-quant-scheduler.cpp @@ -0,0 +1,116 @@ +/* llama-quant-scheduler.cpp -- C++17 + +ASPIRATIONS +----------- + +Whenever possible, we must overlap computation and disk I/O. In fact, disk I/O is the main +bottleneck in very many cases, and currently on `master` it's not handled very well - computation +never overlaps with I/O. There is a great opportunity to improve it! + +At the time of writing (2026-03-02), the code on `master` is kept simple (if a bit messy) and it +simply does... + + load src data -> (convert to f32) -> quantize to target type -> write tensor data + +...in a for loop over all tensors. I believe we may be able to acheive a speedup of ~4x in _some_ +cases by managing the work to be done more effectively. There are many people quantizing many models +every day with untold billions of parameters - we don't want to leave any performance on the table. + +The quantized tensors MUST end up in order in the output GGUF. +*/ + +// #include "ggml-quants.h" +#include "llama.h" +#include "llama-impl.h" +#include "llama-model.h" +#include "llama-quant.h" + +#include +#include +#include +#include +#include +#include + +// pool of compute worker threads +struct compute_pool { + const int32_t n_threads; + std::vector threads; + std::atomic_flag busy; + + compute_pool(const int32_t _n_threads): + n_threads(_n_threads), threads(_n_threads) { + // TODO: prepare the threads? but don't start them. + // TODO: init `busy` atomic flag? 
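+        // a possible sketch for the TODOs above (assumption, not final): give `busy` a known
+        // initial state here and defer thread creation to start(), where the workers can block
+        // on a condition variable until work is distributed
+        busy.clear();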
+ }; + + void start() { + // TODO: start the threads (but wait for work, don't spin) + }; + + void stop() { + // TODO: forcibly stop the thread pool (called should check `busy` before doing this) + }; +}; + +// +// quantization work scheduler +// +// goal: overlap I/O and computation, keep all threads busy (as much as reasonably possible) +// +// the scheduler actually manages (`n_threads` + 2) threads: +// - 1 thread for the `read_worker` +// - `n_thread` threads for the `thread_pool` (tensor math is divided among compute workers) +// - 1 thread for the `write_worker` +// +struct scheduler { + const int32_t n_threads; + + // metadata for all tensors in the model + std::vector tschd_vec; + + // + // scheduling pipeline buffers (one of each at most) + // + + // don't need this if using mmap + std::vector buf_read; // size = largest tensor (as found) (`largest_tensor_size`) + + // dequantization compute buffer + std::vector buf_dequant; // size = largest tensor (as f32) (`largest_tensor_size_dequant`) + + // quantization compute buffer + std::vector buf_quant; // size = largest tensor (quantized) (`largest_tensor_size_quant`) + + // hold tensor data (NOTE: tensors must be in order in the output file) + std::vector buf_write; // size = largest tensor (quantized) + + size_t largest_tensor_size = 0; + size_t largest_tensor_size_dequant = 0; + size_t largest_tensor_size_quant = 0; + + compute_pool pool; + + // initialize + scheduler(const int32_t _n_threads, std::vector _tschd_vec): + n_threads(_n_threads), tschd_vec(_tschd_vec), pool(_n_threads) + { + for (int32_t idx; idx < tschd_vec.size(); idx++) { + /* + TODO: set these: + largest_tensor_size = ...; + largest_tensor_size_dequant = ...; + largest_tensor_size_quant = ...; + */ + } + + // TODO: reserve pipeline buffers + }; + + void run() { + // TODO: start `read_worker` thread + // TODO: THIS thread should manage the compute pool + // TODO: start `write_worker` thread + // return void when done, throw std::runtime_error if something fails + } +}; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 58ed0e9db7a..23b7585adae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -167,9 +167,9 @@ struct quantize_state_impl { { // compile regex patterns once - they are expensive if (params->tensor_types) { - const auto & tensor_types = *static_cast *>(params->tensor_types); - for (const auto & [tname, qtype] : tensor_types) { - tensor_type_patterns.emplace_back(std::regex(tname), qtype); + const auto & tensor_types = *static_cast *>(params->tensor_types); + for (const auto & [name, type] : tensor_types) { + tensor_type_patterns.emplace_back(std::regex(name), type); } } } diff --git a/src/llama-quant.h b/src/llama-quant.h index a91ebffa37e..053214a8aa5 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,10 +1,12 @@ #pragma once -#include +// #include +// #include +// #include "ggml.h" -// Quantization types, used in both quantize.cpp and llama-quant.cpp -struct tensor_quantization { +// store result of parsing --tensor-type option +struct tensor_type_option { std::string name; - ggml_type quant = GGML_TYPE_COUNT; + ggml_type type = GGML_TYPE_COUNT; }; // tensor categorization - used to avoid repeated string matching in quantization logic. @@ -23,3 +25,14 @@ enum class tensor_category { OUTPUT, OTHER }; + +// per-tensor info needed by the quantization work scheduler for efficient quantization. +// constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. 
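+// (illustrative) e.g. a 4096 x 4096 F16 tensor being quantized to Q4_K would be described as
+//   tensor_sched_data { /*ne0*/ 4096, /*ne1*/ 4096, /*ne2*/ 1, /*ne3*/ 1, GGML_TYPE_F16, GGML_TYPE_Q4_K }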
+struct tensor_sched_data { + const int64_t ne0; // ncols + const int64_t ne1; // nrows + const int64_t ne2; // n_expert (or any other 3rd dimension) + const int64_t ne3; // 4D (currently unused) + const ggml_type src_type; + const ggml_type dst_type; +}; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 9a7e2fc1c7b..86c93d72978 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -404,7 +404,7 @@ static ggml_type parse_ggml_type(const char * arg) { return GGML_TYPE_COUNT; } -static bool parse_tensor_type(const char * data, std::vector & tensor_type) { +static bool parse_tensor_type(const char * data, std::vector & tensor_type) { const char * sep = strchr(data, '='); if (sep == nullptr) { printf("\n%s: malformed tensor type '%s'\n\n", __func__, data); @@ -424,11 +424,11 @@ static bool parse_tensor_type(const char * data, std::vector & tensor_type) { +static bool parse_tensor_type_file(const char * filename, std::vector & tensor_type) { std::ifstream file(filename); if (!file) { printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno)); @@ -490,7 +490,7 @@ int main(int argc, char ** argv) { std::string imatrix_file; std::vector included_weights, excluded_weights; std::vector kv_overrides; - std::vector tensor_types; + std::vector tensor_types; std::vector prune_layers; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { From 33b08083ef634c890d09ebf7a8bd4cf12d7abb7e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 4 Mar 2026 23:27:41 -0600 Subject: [PATCH 03/23] remove comment --- src/llama-quant.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-quant.h b/src/llama-quant.h index 053214a8aa5..6dd116a3e55 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,7 +1,5 @@ #pragma once -// #include -// #include -// #include "ggml.h" +#include // store result of parsing --tensor-type option struct tensor_type_option { From 7ff8ec6d87f4efb0a2b89745be1a0ce6ca9ddc1d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 5 Mar 2026 00:47:42 -0600 Subject: [PATCH 04/23] WIP --- src/llama-quant-scheduler.cpp | 94 ++++++++++++++++++----------------- src/llama-quant.h | 17 ++++--- 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index a0cf5116567..9d4be9df38c 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -1,23 +1,19 @@ -/* llama-quant-scheduler.cpp -- C++17 - -ASPIRATIONS ------------ - -Whenever possible, we must overlap computation and disk I/O. In fact, disk I/O is the main -bottleneck in very many cases, and currently on `master` it's not handled very well - computation -never overlaps with I/O. There is a great opportunity to improve it! - -At the time of writing (2026-03-02), the code on `master` is kept simple (if a bit messy) and it -simply does... - - load src data -> (convert to f32) -> quantize to target type -> write tensor data - -...in a for loop over all tensors. I believe we may be able to acheive a speedup of ~4x in _some_ -cases by managing the work to be done more effectively. There are many people quantizing many models -every day with untold billions of parameters - we don't want to leave any performance on the table. - -The quantized tensors MUST end up in order in the output GGUF. -*/ +/** + * + * Whenever possible, we aim to overlap computation and tensor data disk I/O. 
+ * + * This is the primary bottleneck in very many cases, and currently it's not handled very + * efficiently - computation never overlaps with I/O on `master` at the time of writing. Rather, + * the code basically does: + * + * load src tensor data -> dequantize and/or quantize -> write tensor data + * + * ...in a loop over all tensors. There is a great opportunity to improve it! I believe we may be + * able to acheive a speedup of ~4x in _some_ cases by overlapping the work to be done. There are + * many users quantizing many models with many billions of parameters - we don't want to leave any + * performance on the table. + * +**/ // #include "ggml-quants.h" #include "llama.h" @@ -44,50 +40,46 @@ struct compute_pool { // TODO: init `busy` atomic flag? }; - void start() { - // TODO: start the threads (but wait for work, don't spin) - }; - - void stop() { - // TODO: forcibly stop the thread pool (called should check `busy` before doing this) + bool distribute(tensor_sched_data & data) { + // TODO: distribute }; }; // // quantization work scheduler // -// goal: overlap I/O and computation, keep all threads busy (as much as reasonably possible) +// goal: overlap I/O and computation as often as possible to speed-up the quantization process. // -// the scheduler actually manages (`n_threads` + 2) threads: +// the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` -// - `n_thread` threads for the `thread_pool` (tensor math is divided among compute workers) +// - `n_threads` threads for the `compute_pool` (tensor math is divided among compute workers) // - 1 thread for the `write_worker` // struct scheduler { const int32_t n_threads; - // metadata for all tensors in the model + // per-tensor metadata for all tensors in the model std::vector tschd_vec; + size_t largest_tensor_size_src = 0; // size of largest tensor to be quantized (as src type) + size_t largest_tensor_size_f32 = 0; // size of largest tensor to be quantized (as f32) + size_t largest_tensor_size_dst = 0; // size of largest tensor to be quantized (as dst type) + // // scheduling pipeline buffers (one of each at most) // - // don't need this if using mmap - std::vector buf_read; // size = largest tensor (as found) (`largest_tensor_size`) - - // dequantization compute buffer - std::vector buf_dequant; // size = largest tensor (as f32) (`largest_tensor_size_dequant`) + // size: largest_tensor_size_src + std::vector buf_read; // don't need this if using mmap? - // quantization compute buffer - std::vector buf_quant; // size = largest tensor (quantized) (`largest_tensor_size_quant`) + // size: largest_tensor_size_f32 + std::vector buf_dequant; // dequantization buffer - // hold tensor data (NOTE: tensors must be in order in the output file) - std::vector buf_write; // size = largest tensor (quantized) + // size: largest_tensor_size_dst + std::vector buf_quant; // quantization buffer (do we really need this?) 
- size_t largest_tensor_size = 0; - size_t largest_tensor_size_dequant = 0; - size_t largest_tensor_size_quant = 0; + // size = largest tensor (as dst type) + std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; @@ -95,7 +87,7 @@ struct scheduler { scheduler(const int32_t _n_threads, std::vector _tschd_vec): n_threads(_n_threads), tschd_vec(_tschd_vec), pool(_n_threads) { - for (int32_t idx; idx < tschd_vec.size(); idx++) { + for (int32_t idx = 0; idx < tschd_vec.size(); idx++) { /* TODO: set these: largest_tensor_size = ...; @@ -104,13 +96,25 @@ struct scheduler { */ } - // TODO: reserve pipeline buffers + // TODO: allocate pipeline buffers }; - void run() { + ~scheduler() { + stop(); + } + + void start() { // TODO: start `read_worker` thread // TODO: THIS thread should manage the compute pool // TODO: start `write_worker` thread // return void when done, throw std::runtime_error if something fails } + + void stop() { + // TODO: graceful shutdown + deallocation of buffers + } + + void submit_compute(tensor_sched_data & tschd) { + // TODO: + } }; diff --git a/src/llama-quant.h b/src/llama-quant.h index 6dd116a3e55..8f58ec35c93 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,7 +1,7 @@ #pragma once #include -// store result of parsing --tensor-type option +// result of parsing --tensor-type option struct tensor_type_option { std::string name; ggml_type type = GGML_TYPE_COUNT; @@ -24,13 +24,14 @@ enum class tensor_category { OTHER }; -// per-tensor info needed by the quantization work scheduler for efficient quantization. +// per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. struct tensor_sched_data { - const int64_t ne0; // ncols - const int64_t ne1; // nrows - const int64_t ne2; // n_expert (or any other 3rd dimension) - const int64_t ne3; // 4D (currently unused) - const ggml_type src_type; - const ggml_type dst_type; + const void * const src_data; // pointer to raw source tensor data, read-only + const ggml_type src_type; + const ggml_type dst_type; + const int64_t ne0; // ncols + const int64_t ne1; // nrows + const int64_t ne2; // n_expert (or any 3rd tensor dimension) + const int64_t ne3; // any 4th tensor dimension (currently unused, always 1) }; From 77b5a67a92654d6f0587f3447f189c3ee7c13ebb Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 5 Mar 2026 01:46:12 -0600 Subject: [PATCH 05/23] WIP --- src/llama-quant-scheduler.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 9d4be9df38c..ea228dd4c81 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -1,6 +1,7 @@ /** * - * Whenever possible, we aim to overlap computation and tensor data disk I/O. + * Whenever possible, we aim to overlap computation and tensor data disk I/O in the quantization + * process. * * This is the primary bottleneck in very many cases, and currently it's not handled very * efficiently - computation never overlaps with I/O on `master` at the time of writing. Rather, @@ -35,7 +36,8 @@ struct compute_pool { std::atomic_flag busy; compute_pool(const int32_t _n_threads): - n_threads(_n_threads), threads(_n_threads) { + n_threads(_n_threads), threads(_n_threads) + { // TODO: prepare the threads? but don't start them. // TODO: init `busy` atomic flag? 
}; @@ -59,7 +61,7 @@ struct scheduler { const int32_t n_threads; // per-tensor metadata for all tensors in the model - std::vector tschd_vec; + std::vector data_vec; size_t largest_tensor_size_src = 0; // size of largest tensor to be quantized (as src type) size_t largest_tensor_size_f32 = 0; // size of largest tensor to be quantized (as f32) @@ -84,10 +86,10 @@ struct scheduler { compute_pool pool; // initialize - scheduler(const int32_t _n_threads, std::vector _tschd_vec): - n_threads(_n_threads), tschd_vec(_tschd_vec), pool(_n_threads) + scheduler(const int32_t _n_threads, std::vector _data_vec): + n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) { - for (int32_t idx = 0; idx < tschd_vec.size(); idx++) { + for (int32_t idx = 0; idx < data_vec.size(); idx++) { /* TODO: set these: largest_tensor_size = ...; @@ -113,8 +115,4 @@ struct scheduler { void stop() { // TODO: graceful shutdown + deallocation of buffers } - - void submit_compute(tensor_sched_data & tschd) { - // TODO: - } }; From d7964472bccf9d19908c758471a6b178728f5cf1 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 5 Mar 2026 16:42:38 -0600 Subject: [PATCH 06/23] set buffer sizes --- src/llama-quant-scheduler.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index ea228dd4c81..6d8fcda797c 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -63,18 +63,18 @@ struct scheduler { // per-tensor metadata for all tensors in the model std::vector data_vec; - size_t largest_tensor_size_src = 0; // size of largest tensor to be quantized (as src type) - size_t largest_tensor_size_f32 = 0; // size of largest tensor to be quantized (as f32) - size_t largest_tensor_size_dst = 0; // size of largest tensor to be quantized (as dst type) + size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as f32) + size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) // // scheduling pipeline buffers (one of each at most) // - // size: largest_tensor_size_src + // size: max_tensor_size_src std::vector buf_read; // don't need this if using mmap? 
- // size: largest_tensor_size_f32 + // size: max_tensor_size_f32 std::vector buf_dequant; // dequantization buffer // size: largest_tensor_size_dst @@ -89,13 +89,13 @@ struct scheduler { scheduler(const int32_t _n_threads, std::vector _data_vec): n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) { - for (int32_t idx = 0; idx < data_vec.size(); idx++) { - /* - TODO: set these: - largest_tensor_size = ...; - largest_tensor_size_dequant = ...; - largest_tensor_size_quant = ...; - */ + GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); + for (int32_t idx = 0; idx < data_vec.size(); ++idx) { + const auto & data = data_vec[idx]; + const int64_t nrows = data.ne1 * data.ne2 * data.ne3; + max_src_sz = std::max(max_src_sz, nrows * ggml_row_size(data.src_type, data.ne0)); + max_f32_sz = std::max(max_f32_sz, nrows * ggml_row_size(GGML_TYPE_F32, data.ne0)); + max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } // TODO: allocate pipeline buffers From e382e661cddff5907344248662f7c37af3f5a60f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 6 Mar 2026 09:25:27 -0600 Subject: [PATCH 07/23] WIP --- src/llama-quant-scheduler.cpp | 41 +++++++++++++++-------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 6d8fcda797c..0d31de9dda2 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -29,7 +29,7 @@ #include #include -// pool of compute worker threads +// pool of worker threads used for dequantization and quantization struct compute_pool { const int32_t n_threads; std::vector threads; @@ -43,18 +43,18 @@ struct compute_pool { }; bool distribute(tensor_sched_data & data) { - // TODO: distribute + // TODO: distribute }; }; // // quantization work scheduler // -// goal: overlap I/O and computation as often as possible to speed-up the quantization process. +// goal: overlap I/O and computation as much as possible to speed up the quantization process. // // the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` -// - `n_threads` threads for the `compute_pool` (tensor math is divided among compute workers) +// - `n_threads` threads for the `compute_pool` // - 1 thread for the `write_worker` // struct scheduler { @@ -64,28 +64,23 @@ struct scheduler { std::vector data_vec; size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) - size_t max_f32_sz = 0; // size of largest tensor to be quantized (as f32) + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) // - // scheduling pipeline buffers (one of each at most) + // scheduler pipeline buffers (one of each at most) // - // size: max_tensor_size_src - std::vector buf_read; // don't need this if using mmap? - - // size: max_tensor_size_f32 - std::vector buf_dequant; // dequantization buffer - - // size: largest_tensor_size_dst - std::vector buf_quant; // quantization buffer (do we really need this?) - - // size = largest tensor (as dst type) - std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) + // size: max_src_sz + std::vector buf_read; // don't need this if using mmap? 
+ // size: max_f32_sz + std::vector buf_compute; // dequant/quant buffer + // size = max_dst_sz + std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; - // initialize + // init scheduler(const int32_t _n_threads, std::vector _data_vec): n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) { @@ -101,18 +96,18 @@ struct scheduler { // TODO: allocate pipeline buffers }; - ~scheduler() { - stop(); - } - void start() { // TODO: start `read_worker` thread // TODO: THIS thread should manage the compute pool // TODO: start `write_worker` thread - // return void when done, throw std::runtime_error if something fails + // throw std::runtime_error if something fails } void stop() { // TODO: graceful shutdown + deallocation of buffers } + + ~scheduler() { + stop(); + } }; From a4d4aab3aa9a1a696e19fcc4461bf36b4e3ac118 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 6 Mar 2026 16:31:03 -0600 Subject: [PATCH 08/23] WIP --- src/llama-quant-scheduler.cpp | 34 +++++++++++++++++++++++++++++----- src/llama-quant.h | 15 ++++++++------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 0d31de9dda2..718738f2127 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -29,6 +29,14 @@ #include #include +static int get_split_dimension(const tensor_sched_data & tsd, const int32_t n_threads) { + if (tsd.ne0 % n_threads) return 0; + if (tsd.ne1 % n_threads) return 1; + if (tsd.ne2 % n_threads) return 2; + if (tsd.ne3 % n_threads) return 3; + return -1; +} + // pool of worker threads used for dequantization and quantization struct compute_pool { const int32_t n_threads; @@ -63,9 +71,9 @@ struct scheduler { // per-tensor metadata for all tensors in the model std::vector data_vec; - size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) - size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) - size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) + size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) in bytes + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) in bytes + size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) in bytes // // scheduler pipeline buffers (one of each at most) @@ -93,7 +101,17 @@ struct scheduler { max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } - // TODO: allocate pipeline buffers + LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); + buf_read.resize(max_src_sz); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); + + LLAMA_LOG_DEBUG("%s: allocating compute buffer ... ", __func__); + buf_compute.resize(max_f32_sz); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); + + LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); + buf_write.resize(max_dst_sz); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); }; void start() { @@ -104,7 +122,13 @@ struct scheduler { } void stop() { - // TODO: graceful shutdown + deallocation of buffers + LLAMA_LOG_DEBUG("%s: deallocating buffers ... 
", __func__); + + buf_read.clear(); + buf_compute.clear(); + buf_write.clear(); + + LLAMA_LOG_DEBUG("done\n"); } ~scheduler() { diff --git a/src/llama-quant.h b/src/llama-quant.h index 8f58ec35c93..f75a812d44d 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -27,11 +27,12 @@ enum class tensor_category { // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. struct tensor_sched_data { - const void * const src_data; // pointer to raw source tensor data, read-only - const ggml_type src_type; - const ggml_type dst_type; - const int64_t ne0; // ncols - const int64_t ne1; // nrows - const int64_t ne2; // n_expert (or any 3rd tensor dimension) - const int64_t ne3; // any 4th tensor dimension (currently unused, always 1) + const ggml_type src_type; // source tensor type + const ggml_type dst_type; // destination tensor type + const int64_t ne0; // n_cols + const int64_t ne1; // n_rows + const int64_t ne2; // n_expert (or any 3rd tensor dimension) + const int64_t ne3; // any 4th tensor dimension (currently unused, always 1) + const void * const src_data; // pointer to raw source tensor data buffer, read-only + const void * const imatrix; // pointer to imatrix data, or nullptr, read-only }; From 956092ddd584bfc52f2da306998ee9d10d3c53ce Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 6 Mar 2026 22:57:22 -0600 Subject: [PATCH 09/23] WIP --- src/llama-quant-scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 718738f2127..e1e91f6e22b 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -80,7 +80,7 @@ struct scheduler { // // size: max_src_sz - std::vector buf_read; // don't need this if using mmap? + std::vector buf_read; // hold tensor data for reading // size: max_f32_sz std::vector buf_compute; // dequant/quant buffer // size = max_dst_sz From 1e1b692316bbd1501265ab1ed484a66b2427c2d4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 7 Mar 2026 16:21:57 -0600 Subject: [PATCH 10/23] WIP --- src/llama-quant-scheduler.cpp | 141 ++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 32 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index e1e91f6e22b..7c8ae01db23 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -1,21 +1,70 @@ /** * - * Whenever possible, we aim to overlap computation and tensor data disk I/O in the quantization + * Whenever possible, we aim to overlap computation and tensor data I/O during the quantization * process. * - * This is the primary bottleneck in very many cases, and currently it's not handled very - * efficiently - computation never overlaps with I/O on `master` at the time of writing. Rather, - * the code basically does: + * This is the primary bottleneck in very many cases, and at the time of writing (2026-03) it's not + * handled very efficiently on `master` - computation never overlaps with I/O. Rather, the code + * essentially does: * - * load src tensor data -> dequantize and/or quantize -> write tensor data + * read src tensor data -> dequantize and/or quantize -> write dst tensor data * - * ...in a loop over all tensors. There is a great opportunity to improve it! I believe we may be - * able to acheive a speedup of ~4x in _some_ cases by overlapping the work to be done. 
There are
- * many users quantizing many models with many billions of parameters - we don't want to leave any
- * performance on the table.
+ * ...in a synchronous loop over all tensors. There is a great opportunity to improve it! I believe
+ * we may be able to achieve a speedup of ~3x in some cases by properly scheduling the work to be
+ * done. There are many users quantizing many models with many billions of parameters - we don't
+ * want to leave any performance on the table.
  *
 **/
+/**
+ * [NOTE: delete this comment block before PR]
+ *
+ * WORK-IN-PROGRESS -- DEV NOTES
+ * -----------------------------
+ *
+ * the scheduler will work like this:
+ *   0. all buffers start with "read_ready" = false, "write_ready" = true.
+ *   1. ggml_tensor 0 is materialized in the read buffer
+ *      - the read worker thread sets the "read_ready" flag to signal that the read buffer
+ *        now contains a valid ggml_tensor.
+ *      - the compute pool immediately starts consuming the tensor in the read buffer.
+ *        + if the tensor is already in F32, dequantization is not needed. the compute pool quantizes
+ *          directly from the read buffer into the write buffer. at this point, the
+ *          "write_ready" flag is set to signal that ggml_tensor 1 can start being materialized
+ *          in the read buffer.
+ *        + if the tensor is not in F32, dequantization is needed. the compute pool performs a fused
+ *          dequantize-and-quantize operation, utilizing the dequantization buffer to store the F32
+ *          data, and writing the quantized result to the write buffer. as soon as the tensor is
+ *          dequantized, we can set the "write_ready" flag on the read buffer to signal that
+ *          ggml_tensor 1 can start being materialized in the read buffer.
+ *        + the main thread blocks until the "write_ready" flag is set on the write buffer. as soon
+ *          as the write buffer is ready to be written to, the compute result is stored there, and
+ *          the main thread sets the "read_ready" flag on the write buffer. the compute pool is now
+ *          free to process ggml_tensor 1.
+ *      - the write worker waits until the write buffer is signaled "read_ready", at which point it
+ *        can begin writing the quantized tensor data to the output stream. when done writing, it
+ *        sets the "read_ready" flag to false and the "write_ready" flag to true, thus preparing
+ *        the write buffer for the next quantized data.
+ *   2.
+ *
+ * -----------------------------
+ *
+ * [NOTE: delete this comment block before PR]
+**/
+
+/**
+ * [NOTE: delete this comment block before PR]
+ *
+ * WORK-IN-PROGRESS -- LLM NOTES
+ * -----------------------------
+ *
+ * [LLM: fill in this section as you like with your own notes, separate from the human dev]
+ *
+ * -----------------------------
+ *
+ * [NOTE: delete this comment block before PR]
+**/
+
 // #include "ggml-quants.h"
 #include "llama.h"
 #include "llama-impl.h"
@@ -23,35 +72,61 @@
 #include "llama-quant.h"
 
 #include
+#include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
 
-static int get_split_dimension(const tensor_sched_data & tsd, const int32_t n_threads) {
-    if (tsd.ne0 % n_threads) return 0;
-    if (tsd.ne1 % n_threads) return 1;
-    if (tsd.ne2 % n_threads) return 2;
-    if (tsd.ne3 % n_threads) return 3;
+// return the dimension along which we can divide this tensor into `n` equally-sized chunks.
+// return -1 if none are divisible.
+static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { + if (tsd.ne0 > 1 && tsd.ne0 % n == 0) return 0; + if (tsd.ne1 > 1 && tsd.ne1 % n == 0) return 1; + if (tsd.ne2 > 1 && tsd.ne2 % n == 0) return 2; + if (tsd.ne3 > 1 && tsd.ne3 % n == 0) return 3; return -1; } +template struct sched_buffer { + const size_t size; // number of T items that the buffer can hold + std::vector buf; // the buffer + std::atomic write_ready = true; // is this buffer ready to be written to? + std::atomic read_ready = false; // is this buffer ready to be read from? + std::atomic idx = -1; // which tensor is currently / most recently stored? + // init + sched_buffer(const size_t _size): size(_size), buf(_size) {}; + // reset + void reset() { + buf.clear(); + write_ready = true; + read_ready = false; + idx = -1; + }; + // destruct + ~sched_buffer() { + buf.clear(); + // TODO: is more needed here? + }; +}; + // pool of worker threads used for dequantization and quantization struct compute_pool { const int32_t n_threads; std::vector threads; - std::atomic_flag busy; + std::atomic busy; compute_pool(const int32_t _n_threads): n_threads(_n_threads), threads(_n_threads) - { - // TODO: prepare the threads? but don't start them. - // TODO: init `busy` atomic flag? - }; + {}; - bool distribute(tensor_sched_data & data) { - // TODO: distribute + // distribute the work for this tensor among the compute threads. + // return an exception, if one occured during computation. + std::optional distribute(tensor_sched_data & data) { + // TODO }; }; @@ -80,17 +155,19 @@ struct scheduler { // // size: max_src_sz - std::vector buf_read; // hold tensor data for reading + sched_buffer buf_read; // hold source tensor data for reading // size: max_f32_sz - std::vector buf_compute; // dequant/quant buffer + sched_buffer buf_dequant; // hold dequantized tensor data // size = max_dst_sz - std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) + sched_buffer buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; // init scheduler(const int32_t _n_threads, std::vector _data_vec): - n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) + n_threads(_n_threads), + data_vec(_data_vec), + pool(_n_threads) { GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); for (int32_t idx = 0; idx < data_vec.size(); ++idx) { @@ -101,16 +178,16 @@ struct scheduler { max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } - LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read.resize(max_src_sz); + LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); + buf_read(max_src_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating compute buffer ... ", __func__); - buf_compute.resize(max_f32_sz); + LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); + buf_dequant(max_f32_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write.resize(max_dst_sz); + LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); + buf_write(max_dst_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); }; @@ -125,7 +202,7 @@ struct scheduler { LLAMA_LOG_DEBUG("%s: deallocating buffers ... 
", __func__); buf_read.clear(); - buf_compute.clear(); + buf_dequant.clear(); buf_write.clear(); LLAMA_LOG_DEBUG("done\n"); From 6f78f2b5db81a823946abad0b80162351b46c213 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 7 Mar 2026 22:52:21 -0600 Subject: [PATCH 11/23] WIP --- src/llama-quant-scheduler.cpp | 56 +++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 7c8ae01db23..c4867dd1fe2 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -91,40 +91,47 @@ static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { return -1; } -template struct sched_buffer { - const size_t size; // number of T items that the buffer can hold - std::vector buf; // the buffer - std::atomic write_ready = true; // is this buffer ready to be written to? - std::atomic read_ready = false; // is this buffer ready to be read from? - std::atomic idx = -1; // which tensor is currently / most recently stored? - // init - sched_buffer(const size_t _size): size(_size), buf(_size) {}; - // reset +template +struct sched_buffer { + size_t size; + std::vector buf; + std::atomic write_ready; + std::atomic read_ready; + std::atomic idx; + + sched_buffer() : size(0), buf(), write_ready(true), read_ready(false), idx(-1) {} + + void init(const size_t _size) { + size = _size; + buf = std::vector(_size); + write_ready = true; + read_ready = false; + idx = -1; + } + void reset() { buf.clear(); write_ready = true; read_ready = false; idx = -1; }; - // destruct - ~sched_buffer() { - buf.clear(); - // TODO: is more needed here? - }; + + ~sched_buffer() = default; }; -// pool of worker threads used for dequantization and quantization +// pool of worker threads used for dequantization + quantization struct compute_pool { const int32_t n_threads; std::vector threads; std::atomic busy; + std::optional opt_exc; compute_pool(const int32_t _n_threads): n_threads(_n_threads), threads(_n_threads) {}; - // distribute the work for this tensor among the compute threads. - // return an exception, if one occured during computation. + // distribute the computation to all worker threads. + // return an exception, if one occured during computation, nullopt otherwise. std::optional distribute(tensor_sched_data & data) { // TODO }; @@ -133,7 +140,8 @@ struct compute_pool { // // quantization work scheduler // -// goal: overlap I/O and computation as much as possible to speed up the quantization process. +// goal: overlap I/O and computation as much as possible to speed up the quantization process, +// while still being mindful of total memory usage. // // the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` @@ -179,16 +187,17 @@ struct scheduler { } LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read(max_src_sz); + buf_read.init(max_src_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); - buf_dequant(max_f32_sz); + buf_dequant.init(max_f32_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write(max_dst_sz); + buf_write.init(max_dst_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); + }; void start() { @@ -200,11 +209,6 @@ struct scheduler { void stop() { LLAMA_LOG_DEBUG("%s: deallocating buffers ... 
", __func__); - - buf_read.clear(); - buf_dequant.clear(); - buf_write.clear(); - LLAMA_LOG_DEBUG("done\n"); } From 1896fede20e6f527b0aba499ee5bbbf431f879e7 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 01:08:34 -0600 Subject: [PATCH 12/23] WIP --- src/llama-quant-scheduler.cpp | 146 ++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 59 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index c4867dd1fe2..86e50bd225a 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -60,6 +60,30 @@ * * [LLM: fill in this section as you like with your own notes, separate from the human dev] * + * ARCHITECTURE REVIEW & RECOMMENDATIONS + * ------------------------------------- + * 1. Synchronization Primitives: + * - Replace atomic bool polling with std::condition_variable to prevent busy-waiting. + * - Ensure memory_order_acquire/release is used if sticking with atomics for flags. + * + * 2. Buffering Strategy: + * - Current single-buffer design couples I/O and Compute latency. + * - Recommendation: Implement double-buffering (ping-pong) for 'buf_read' to allow + * loading Tensor N+1 while Computing Tensor N. + * + * 3. Exception Handling: + * - Change compute_pool::opt_exc from std::optional to + * std::optional to avoid object slicing. + * - Add a global 'stop_flag' to the scheduler to terminate all workers if one fails. + * + * 4. Compatibility: + * - Remove include. llama.cpp targets C++17; std::float_t is C++23. + * - Remove std::optional wrappers on buffers; they are always initialized. + * + * 5. Thread Pool: + * - compute_pool constructor must launch worker threads with a wait-loop, + * not just resize the vector. + * * ----------------------------- * * [NOTE: delete this comment block before PR] @@ -72,18 +96,17 @@ #include "llama-quant.h" #include -#include #include +#include #include #include -#include #include #include #include -// return the dimension along which we can divide this tensor into `n` equally-sized chunks. -// return -1 if none are divisible. -static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { +// determine the dimension along which we can divide this tensor into `n` equally-sized chunks. +// return 0, 1, 2, or 3. if none are divisible, return -1. +static int get_split_dim(const tensor_sched_data & tsd, const int64_t n) { if (tsd.ne0 > 1 && tsd.ne0 % n == 0) return 0; if (tsd.ne1 > 1 && tsd.ne1 % n == 0) return 1; if (tsd.ne2 > 1 && tsd.ne2 % n == 0) return 2; @@ -93,46 +116,56 @@ static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { template struct sched_buffer { - size_t size; - std::vector buf; - std::atomic write_ready; - std::atomic read_ready; - std::atomic idx; - - sched_buffer() : size(0), buf(), write_ready(true), read_ready(false), idx(-1) {} - - void init(const size_t _size) { - size = _size; - buf = std::vector(_size); - write_ready = true; - read_ready = false; - idx = -1; + static_assert(std::is_same_v || std::is_same_v, + "sched_buffer only supports uint8_t and float"); + + std::vector buf; + std::mutex mtx; + std::atomic idx; // which tensor is currently / most recently stored? 
(-1 if none) + std::condition_variable cv; + std::atomic has_data; + + sched_buffer(const size_t _size): buf(_size), has_data(false), idx(-1) {} + + // producer calls this when data is written + void notify_ready() { + { + std::lock_guard lock(mtx); + has_data = true; + } + cv.notify_one(); } - void reset() { - buf.clear(); - write_ready = true; - read_ready = false; - idx = -1; - }; + // consumer calls this to wait for data + void wait_ready() { + std::unique_lock lock(mtx); + cv.wait(lock, [this]{ return has_data; }); + } - ~sched_buffer() = default; + // consumer calls this when done processing to release buffer + void release() { + { + std::lock_guard lock(mtx); + has_data = false; + } + cv.notify_one(); + } }; // pool of worker threads used for dequantization + quantization struct compute_pool { - const int32_t n_threads; + const int32_t n_threads; std::vector threads; - std::atomic busy; - std::optional opt_exc; + std::atomic busy; compute_pool(const int32_t _n_threads): - n_threads(_n_threads), threads(_n_threads) - {}; + n_threads(_n_threads), threads(_n_threads), busy(false) + { + // TODO: do we need to init the threads, or can this be left empty? + }; // distribute the computation to all worker threads. - // return an exception, if one occured during computation, nullopt otherwise. - std::optional distribute(tensor_sched_data & data) { + void distribute(tensor_sched_data & data) const { // TODO }; }; @@ -141,7 +174,7 @@ struct compute_pool { // quantization work scheduler // // goal: overlap I/O and computation as much as possible to speed up the quantization process, -// while still being mindful of total memory usage. +// while being mindful of total memory usage. // // the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` @@ -151,35 +184,37 @@ struct compute_pool { struct scheduler { const int32_t n_threads; - // per-tensor metadata for all tensors in the model - std::vector data_vec; + // per-tensor data needed by the scheduler for all model tensors + std::vector tsd_vec; - size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) in bytes - size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) in bytes - size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) in bytes + size_t max_src_sz; // size of largest tensor to be quantized (as src type) in bytes + size_t max_f32_sz; // size of largest tensor to be quantized (as float32) in bytes + size_t max_dst_sz; // size of largest tensor to be quantized (as dst type) in bytes // // scheduler pipeline buffers (one of each at most) // // size: max_src_sz - sched_buffer buf_read; // hold source tensor data for reading + std::optional> buf_read; // hold source tensor data for reading // size: max_f32_sz - sched_buffer buf_dequant; // hold dequantized tensor data + std::optional> buf_dequant; // hold dequantized tensor data // size = max_dst_sz - sched_buffer buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) + std::optional> buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; // init - scheduler(const int32_t _n_threads, std::vector _data_vec): + scheduler(const int32_t _n_threads, std::vector _tsd_vec): n_threads(_n_threads), - data_vec(_data_vec), + tsd_vec(_tsd_vec), + max_src_sz(0), max_f32_sz(0), max_dst_sz(0), + buf_read(std::nullopt), buf_dequant(std::nullopt), buf_write(std::nullopt), pool(_n_threads) { 
GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); - for (int32_t idx = 0; idx < data_vec.size(); ++idx) { - const auto & data = data_vec[idx]; + for (int32_t idx = 0; idx < tsd_vec.size(); ++idx) { + const auto & data = tsd_vec[idx]; const int64_t nrows = data.ne1 * data.ne2 * data.ne3; max_src_sz = std::max(max_src_sz, nrows * ggml_row_size(data.src_type, data.ne0)); max_f32_sz = std::max(max_f32_sz, nrows * ggml_row_size(GGML_TYPE_F32, data.ne0)); @@ -187,32 +222,25 @@ struct scheduler { } LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read.init(max_src_sz); + buf_read.emplace(max_src_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); - buf_dequant.init(max_f32_sz); + buf_dequant.emplace(max_f32_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write.init(max_dst_sz); + buf_write.emplace(max_dst_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); - }; + } - void start() { + void run() { // TODO: start `read_worker` thread // TODO: THIS thread should manage the compute pool // TODO: start `write_worker` thread // throw std::runtime_error if something fails } - void stop() { - LLAMA_LOG_DEBUG("%s: deallocating buffers ... ", __func__); - LLAMA_LOG_DEBUG("done\n"); - } - - ~scheduler() { - stop(); - } + ~scheduler() = default; }; From 3c046f941075d04c8ea2a3aa8ae50b3b2b60e567 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 01:23:59 -0600 Subject: [PATCH 13/23] WIP --- src/llama-quant-scheduler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 86e50bd225a..11eafc68dcd 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -89,7 +89,6 @@ * [NOTE: delete this comment block before PR] **/ -// #include "ggml-quants.h" #include "llama.h" #include "llama-impl.h" #include "llama-model.h" From 8195ad644b570914aa6e9fc80d1e0a062be2bb24 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 01:58:44 -0600 Subject: [PATCH 14/23] re-org includes --- src/llama-quant-scheduler.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 11eafc68dcd..5b7db2bc160 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -94,14 +94,15 @@ #include "llama-model.h" #include "llama-quant.h" -#include -#include -#include -#include +#include #include #include #include -#include +#include +#include +#include +#include +#include // determine the dimension along which we can divide this tensor into `n` equally-sized chunks. // return 0, 1, 2, or 3. if none are divisible, return -1. 
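The sched_buffer developed in the commits above is converging on a classic single-slot producer/consumer handoff: one mutex, one condition variable, one has-data flag. The following is a minimal, self-contained sketch of that pattern for reference only; the names slot, produce and consume are illustrative and are not part of this patch series, and the real scheduler would let the read of tensor N+1 overlap with the compute and write of tensor N instead of strictly alternating the way this toy example does.

    #include <algorithm>
    #include <condition_variable>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    // single-slot handoff: the producer fills the buffer and signals it, the consumer
    // waits for it, drains it, and releases the slot so the producer can reuse it
    struct slot {
        std::vector<uint8_t>    buf;
        bool                    has_data = false;
        std::mutex              mtx;
        std::condition_variable cv;

        explicit slot(size_t size) : buf(size) {}

        // stands in for the read worker materializing the next tensor in the buffer
        void produce(uint8_t fill) {
            std::unique_lock<std::mutex> lock(mtx);
            cv.wait(lock, [this] { return !has_data; }); // wait until the slot is free
            std::fill(buf.begin(), buf.end(), fill);
            has_data = true;
            lock.unlock();
            cv.notify_one();
        }

        // stands in for the compute pool consuming the tensor currently in the buffer
        void consume(uint8_t expected) {
            std::unique_lock<std::mutex> lock(mtx);
            cv.wait(lock, [this] { return has_data; }); // wait until the slot is full
            if (buf[0] != expected) {
                std::fprintf(stderr, "unexpected payload\n");
            }
            has_data = false;
            lock.unlock();
            cv.notify_one();
        }
    };

    int main() {
        slot s(1024);
        const int n_tensors = 8;

        std::thread reader([&] {
            for (int i = 0; i < n_tensors; ++i) {
                s.produce(static_cast<uint8_t>(i)); // "read" tensor i
            }
        });

        for (int i = 0; i < n_tensors; ++i) {
            s.consume(static_cast<uint8_t>(i));     // "compute + write" tensor i
        }

        reader.join();
        std::printf("processed %d tensors through a single-slot pipeline\n", n_tensors);
        return 0;
    }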
From c2e55ccefce913c261309a0d50abdafd8e41586d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 16:05:41 -0500 Subject: [PATCH 15/23] reflect header changes from # 19770 --- src/llama-quant.cpp | 24 ++++++++++++++++++++++++ src/llama-quant.h | 23 ----------------------- tools/quantize/quantize.cpp | 7 +++++++ 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 23b7585adae..caf5d10a9d6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -14,6 +14,30 @@ #include #include +// result of parsing --tensor-type option +// (changes to this struct must be reflected in tools/quantize/quantize.cpp) +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + +// tensor categorization - used to avoid repeated string matching in quantization logic. +// this is different from LLM_TN - we want broad categories, not specific tensor names per arch. +enum class tensor_category { + TOKEN_EMBD, + ATTENTION_Q, + ATTENTION_V, + ATTENTION_K, + ATTENTION_QKV, + ATTENTION_KV_B, + ATTENTION_OUTPUT, + FFN_UP, + FFN_GATE, + FFN_DOWN, + OUTPUT, + OTHER +}; + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { diff --git a/src/llama-quant.h b/src/llama-quant.h index f75a812d44d..0604ea63bdb 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,29 +1,6 @@ #pragma once #include -// result of parsing --tensor-type option -struct tensor_type_option { - std::string name; - ggml_type type = GGML_TYPE_COUNT; -}; - -// tensor categorization - used to avoid repeated string matching in quantization logic. -// this is different from LLM_TN - we want broad categories, not specific tensor names per arch. -enum class tensor_category { - TOKEN_EMBD, - ATTENTION_Q, - ATTENTION_V, - ATTENTION_K, - ATTENTION_QKV, - ATTENTION_KV_B, - ATTENTION_OUTPUT, - FFN_UP, - FFN_GATE, - FFN_DOWN, - OUTPUT, - OTHER -}; - // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. struct tensor_sched_data { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 7aa84859bc1..04276742d5c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -19,6 +19,13 @@ #include #include +// result of parsing --tensor-type option +// (changes to this struct must be reflected in src/llama-quant.cpp) +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + struct quant_option { std::string name; llama_ftype ftype; From 8e3c6890e807c389b7a1da6bc5e3ef8ee031e2c5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 16:13:55 -0500 Subject: [PATCH 16/23] WIP --- src/llama-quant.cpp | 6 +++--- tools/quantize/quantize.cpp | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index caf5d10a9d6..74ff7f75caa 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -16,9 +16,9 @@ // result of parsing --tensor-type option // (changes to this struct must be reflected in tools/quantize/quantize.cpp) -struct tensor_quantization { +struct tensor_type_option { std::string name; - ggml_type quant = GGML_TYPE_COUNT; + ggml_type type = GGML_TYPE_COUNT; }; // tensor categorization - used to avoid repeated string matching in quantization logic. 
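Each tensor_type_option produced by --tensor-type parsing is later turned into a compiled regex that is matched against tensor names to pick a per-tensor override type. A minimal sketch of that name-to-override lookup is shown below; it uses a stand-in fake_type enum instead of ggml_type so it builds on its own, and the pattern strings and tensor names are examples only, not the exact syntax accepted by parse_tensor_type.

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <utility>
    #include <vector>

    // stand-in for ggml_type so the sketch compiles on its own
    enum class fake_type { f16, q8_0, q6_k };

    int main() {
        // patterns as they might come out of options like --tensor-type "attn_v=q6_k"
        std::vector<std::pair<std::regex, fake_type>> patterns;
        patterns.emplace_back(std::regex("attn_v\\.weight"),   fake_type::q6_k);
        patterns.emplace_back(std::regex("ffn_down\\.weight"), fake_type::q8_0);

        const std::string names[] = {
            "blk.0.attn_v.weight",
            "blk.0.ffn_up.weight",
            "blk.1.ffn_down.weight",
        };

        for (const auto & name : names) {
            fake_type type = fake_type::f16; // whatever the ftype rules would otherwise pick
            for (const auto & [re, override_type] : patterns) {
                if (std::regex_search(name, re)) {
                    type = override_type;
                    break;
                }
            }
            std::printf("%-24s -> type %d\n", name.c_str(), static_cast<int>(type));
        }
        return 0;
    }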
@@ -189,7 +189,7 @@ struct quantize_state_impl { : model(model) , params(params) { - // compile regex patterns once - they are expensive + // compile regex patterns just once - they could be expensive if (params->tensor_types) { const auto & tensor_types = *static_cast *>(params->tensor_types); for (const auto & [name, type] : tensor_types) { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 04276742d5c..b84b2b6e554 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -1,7 +1,6 @@ #include "common.h" #include "llama.h" #include "gguf.h" -#include "../src/llama-quant.h" #include #include @@ -21,9 +20,9 @@ // result of parsing --tensor-type option // (changes to this struct must be reflected in src/llama-quant.cpp) -struct tensor_quantization { +struct tensor_type_option { std::string name; - ggml_type quant = GGML_TYPE_COUNT; + ggml_type type = GGML_TYPE_COUNT; }; struct quant_option { @@ -503,7 +502,7 @@ int main(int argc, char ** argv) { std::string imatrix_file; std::vector included_weights, excluded_weights; std::vector kv_overrides; - std::vector tensor_types; + std::vector tensor_type_options; std::vector prune_layers; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { @@ -528,11 +527,11 @@ int main(int argc, char ** argv) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) { - if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) { + if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_type_options)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) { - if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) { + if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_type_options)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { @@ -626,8 +625,8 @@ int main(int argc, char ** argv) { kv_overrides.back().key[0] = 0; params.kv_overrides = &kv_overrides; } - if (!tensor_types.empty()) { - params.tensor_types = &tensor_types; + if (!tensor_type_options.empty()) { + params.tensor_types = &tensor_type_options; } if (!prune_layers.empty()) { params.prune_layers = &prune_layers; From fb728ac63bb6bd99999c7a1c7f2d0aca52de541d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 15:39:44 -0500 Subject: [PATCH 17/23] WIP --- src/llama-quant-scheduler.cpp | 109 ++++++++++++++++------------------ 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 5b7db2bc160..b69c8c7b87e 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -60,30 +60,6 @@ * * [LLM: fill in this section as you like with your own notes, separate from the human dev] * - * ARCHITECTURE REVIEW & RECOMMENDATIONS - * ------------------------------------- - * 1. Synchronization Primitives: - * - Replace atomic bool polling with std::condition_variable to prevent busy-waiting. - * - Ensure memory_order_acquire/release is used if sticking with atomics for flags. - * - * 2. Buffering Strategy: - * - Current single-buffer design couples I/O and Compute latency. - * - Recommendation: Implement double-buffering (ping-pong) for 'buf_read' to allow - * loading Tensor N+1 while Computing Tensor N. - * - * 3. Exception Handling: - * - Change compute_pool::opt_exc from std::optional to - * std::optional to avoid object slicing. 
- * - Add a global 'stop_flag' to the scheduler to terminate all workers if one fails. - * - * 4. Compatibility: - * - Remove include. llama.cpp targets C++17; std::float_t is C++23. - * - Remove std::optional wrappers on buffers; they are always initialized. - * - * 5. Thread Pool: - * - compute_pool constructor must launch worker threads with a wait-loop, - * not just resize the vector. - * * ----------------------------- * * [NOTE: delete this comment block before PR] @@ -91,7 +67,6 @@ #include "llama.h" #include "llama-impl.h" -#include "llama-model.h" #include "llama-quant.h" #include @@ -99,51 +74,60 @@ #include #include #include -#include -#include #include #include // determine the dimension along which we can divide this tensor into `n` equally-sized chunks. // return 0, 1, 2, or 3. if none are divisible, return -1. static int get_split_dim(const tensor_sched_data & tsd, const int64_t n) { - if (tsd.ne0 > 1 && tsd.ne0 % n == 0) return 0; - if (tsd.ne1 > 1 && tsd.ne1 % n == 0) return 1; - if (tsd.ne2 > 1 && tsd.ne2 % n == 0) return 2; - if (tsd.ne3 > 1 && tsd.ne3 % n == 0) return 3; + if (tsd.ne0 > n && tsd.ne0 % n == 0) return 0; + if (tsd.ne1 > n && tsd.ne1 % n == 0) return 1; + if (tsd.ne2 > n && tsd.ne2 % n == 0) return 2; + if (tsd.ne3 > n && tsd.ne3 % n == 0) return 3; return -1; } -template -struct sched_buffer { +template struct sched_buffer { static_assert(std::is_same_v || std::is_same_v, "sched_buffer only supports uint8_t and float"); std::vector buf; std::mutex mtx; - std::atomic idx; // which tensor is currently / most recently stored? (-1 if none) - std::condition_variable cv; + std::atomic idx; // which tensor is currently or most recently stored? (-1 at init, then 0 for 1st tensor, 1 for 2nd tensor...) std::atomic has_data; + std::condition_variable cv; - sched_buffer(const size_t _size): buf(_size), has_data(false), idx(-1) {} + // init but don't allocate the buffer yet + sched_buffer(): + has_data(false), idx(-1) + {} - // producer calls this when data is written - void notify_ready() { + // allocate the buffer and return the allocated size in bytes + size_t allocate(const size_t _size) { + buf.resize(_size); + return sizeof(T) * _size; + } + + // signal to workers that this buffer now has data for tensor at index `_idx`. + // this updates the buffer's `idx` to match. all indices must be sequential. + void signal_has_data(const int64_t _idx) { { std::lock_guard lock(mtx); + GGML_ASSERT(_idx == idx + 1 && "buffer tensor indices must be sequential"); has_data = true; + idx = _idx; } cv.notify_one(); } - // consumer calls this to wait for data - void wait_ready() { + // workers call this function to wait for data in this buffer. + void wait_for_data() { std::unique_lock lock(mtx); cv.wait(lock, [this]{ return has_data; }); } - // consumer calls this when done processing to release buffer - void release() { + // signal to workers that this buffer should no longer be read from. 
+ void signal_no_data() { { std::lock_guard lock(mtx); has_data = false; @@ -187,20 +171,24 @@ struct scheduler { // per-tensor data needed by the scheduler for all model tensors std::vector tsd_vec; - size_t max_src_sz; // size of largest tensor to be quantized (as src type) in bytes - size_t max_f32_sz; // size of largest tensor to be quantized (as float32) in bytes - size_t max_dst_sz; // size of largest tensor to be quantized (as dst type) in bytes + size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) in bytes + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) in bytes + size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) in bytes // // scheduler pipeline buffers (one of each at most) // // size: max_src_sz - std::optional> buf_read; // hold source tensor data for reading + sched_buffer buf_read; // tensor data is read into here as fast as possible (read worker keeps it full). + // size: max_src_sz + sched_buffer buf_compute_src; // compute workers read src tensor data from here // size: max_f32_sz - std::optional> buf_dequant; // hold dequantized tensor data + sched_buffer buf_compute_f32; // intermediate f32 tensor data (if necessary) + // size = max_dst_sz + sched_buffer buf_compute_dst; // compute workers write dst tensor data into here // size = max_dst_sz - std::optional> buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) + sched_buffer buf_write; // tensor data is written to the output stream IN ORDER by the write worker. compute_pool pool; @@ -208,36 +196,41 @@ struct scheduler { scheduler(const int32_t _n_threads, std::vector _tsd_vec): n_threads(_n_threads), tsd_vec(_tsd_vec), - max_src_sz(0), max_f32_sz(0), max_dst_sz(0), - buf_read(std::nullopt), buf_dequant(std::nullopt), buf_write(std::nullopt), pool(_n_threads) { GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); for (int32_t idx = 0; idx < tsd_vec.size(); ++idx) { const auto & data = tsd_vec[idx]; - const int64_t nrows = data.ne1 * data.ne2 * data.ne3; + const size_t nrows = data.ne1 * data.ne2 * data.ne3; max_src_sz = std::max(max_src_sz, nrows * ggml_row_size(data.src_type, data.ne0)); max_f32_sz = std::max(max_f32_sz, nrows * ggml_row_size(GGML_TYPE_F32, data.ne0)); max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } - LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read.emplace(max_src_sz); + LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); + GGML_ASSERT(max_src_sz == buf_read.allocate(max_src_sz)); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); - buf_dequant.emplace(max_f32_sz); + LLAMA_LOG_DEBUG("%s: allocating compute src buffer ... ", __func__); + GGML_ASSERT(max_src_sz == buf_compute_src.allocate(max_src_sz)); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); + + LLAMA_LOG_DEBUG("%s: allocating compute f32 buffer ... ", __func__); + GGML_ASSERT(max_f32_sz == buf_compute_f32.allocate(max_f32_sz / sizeof(float))); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write.emplace(max_dst_sz); + LLAMA_LOG_DEBUG("%s: allocating compute dst buffer ... 
", __func__); + GGML_ASSERT(max_dst_sz == buf_compute_dst.allocate(max_dst_sz)); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); + LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); + GGML_ASSERT(max_dst_sz == buf_write.allocate(max_dst_sz)); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); } void run() { // TODO: start `read_worker` thread - // TODO: THIS thread should manage the compute pool + // TODO: start `compute_worker` thread (?) // TODO: start `write_worker` thread // throw std::runtime_error if something fails } From 30621906c80f2b0915d1477399a9678ba9855f94 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 17:01:29 -0500 Subject: [PATCH 18/23] WIP --- src/llama-quant-scheduler.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index b69c8c7b87e..4093f1d500f 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -180,18 +180,22 @@ struct scheduler { // // size: max_src_sz - sched_buffer buf_read; // tensor data is read into here as fast as possible (read worker keeps it full). + sched_buffer buf_read; // tensor data is read into here as fast as possible by `reader_th` // size: max_src_sz - sched_buffer buf_compute_src; // compute workers read src tensor data from here + sched_buffer buf_compute_src; // compute pool reads src tensor data from here // size: max_f32_sz sched_buffer buf_compute_f32; // intermediate f32 tensor data (if necessary) // size = max_dst_sz - sched_buffer buf_compute_dst; // compute workers write dst tensor data into here + sched_buffer buf_compute_dst; // compute pool writes dst tensor data into here // size = max_dst_sz - sched_buffer buf_write; // tensor data is written to the output stream IN ORDER by the write worker. + sched_buffer buf_write; // tensor data is written from here to the output stream (IN ORDER) by `writer_th` compute_pool pool; + std::thread reader_th; // constantly reading tensor data from the original model into buf_read. + std::thread compute_th; // manages compute_pool (exceptions, stopping, etc.) + std::thread writer_th; // constantly writing tensor data from buf_write to the output stream IN ORDER. + // init scheduler(const int32_t _n_threads, std::vector _tsd_vec): n_threads(_n_threads), @@ -229,9 +233,9 @@ struct scheduler { } void run() { - // TODO: start `read_worker` thread - // TODO: start `compute_worker` thread (?) 
- // TODO: start `write_worker` thread + // TODO: start `reader_th` thread + // TODO: start `compute_th` thread + // TODO: start `writer_th` thread // throw std::runtime_error if something fails } From 7d93875604eab61c919dac13e813a4fb0dcadbb6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 21:30:36 -0500 Subject: [PATCH 19/23] WIP --- src/CMakeLists.txt | 1 + src/llama-quant-scheduler.cpp | 60 ++++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 283823fa9c8..a98ed4e1cd0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ add_library(llama llama-model-saver.cpp llama-model.cpp llama-quant.cpp + llama-quant-scheduler.cpp llama-sampler.cpp llama-vocab.cpp unicode-data.cpp diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 4093f1d500f..5b44da5ea59 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -77,16 +77,6 @@ #include #include -// determine the dimension along which we can divide this tensor into `n` equally-sized chunks. -// return 0, 1, 2, or 3. if none are divisible, return -1. -static int get_split_dim(const tensor_sched_data & tsd, const int64_t n) { - if (tsd.ne0 > n && tsd.ne0 % n == 0) return 0; - if (tsd.ne1 > n && tsd.ne1 % n == 0) return 1; - if (tsd.ne2 > n && tsd.ne2 % n == 0) return 2; - if (tsd.ne3 > n && tsd.ne3 % n == 0) return 3; - return -1; -} - template struct sched_buffer { static_assert(std::is_same_v || std::is_same_v, "sched_buffer only supports uint8_t and float"); @@ -98,9 +88,7 @@ template struct sched_buffer { std::condition_variable cv; // init but don't allocate the buffer yet - sched_buffer(): - has_data(false), idx(-1) - {} + sched_buffer(): has_data(false), idx(-1) {} // allocate the buffer and return the allocated size in bytes size_t allocate(const size_t _size) { @@ -109,11 +97,11 @@ template struct sched_buffer { } // signal to workers that this buffer now has data for tensor at index `_idx`. - // this updates the buffer's `idx` to match. all indices must be sequential. + // this updates the buffer's `idx` to match. indices must be sequential. void signal_has_data(const int64_t _idx) { { std::lock_guard lock(mtx); - GGML_ASSERT(_idx == idx + 1 && "buffer tensor indices must be sequential"); + GGML_ASSERT(_idx == idx + 1 && "tensor buffer indices must be sequential"); has_data = true; idx = _idx; } @@ -136,6 +124,42 @@ template struct sched_buffer { } }; +struct read_worker { + std::thread thread; + const sched_buffer & buf; + + read_worker(const sched_buffer & _buf): buf(_buf) { + // TODO: init? + }; + + ~read_worker() { + // TODO: safe stoppage + destruction of thread + } +}; + +struct write_worker { + std::thread thread; + const sched_buffer & buf; + + write_worker(const sched_buffer & _buf): buf(_buf) { + // TODO: init? + }; + + ~write_worker() { + // TODO: safe stoppage + destruction of thread + } +}; + +// determine the dimension along which we can divide this tensor into `n` equally-sized chunks. +// return 0, 1, 2, or 3. if none are divisible, return -1. 
+static int get_split_dim(const std::vector & ne, const int64_t n) { + if (ne[0] > n && ne[0] % n == 0) return 0; + if (ne[1] > n && ne[1] % n == 0) return 1; + if (ne[2] > n && ne[2] % n == 0) return 2; + if (ne[3] > n && ne[3] % n == 0) return 3; + return -1; +} + // pool of worker threads used for dequantization + quantization struct compute_pool { const int32_t n_threads; @@ -192,9 +216,9 @@ struct scheduler { compute_pool pool; - std::thread reader_th; // constantly reading tensor data from the original model into buf_read. - std::thread compute_th; // manages compute_pool (exceptions, stopping, etc.) - std::thread writer_th; // constantly writing tensor data from buf_write to the output stream IN ORDER. + // std::thread reader_th; // constantly reading tensor data from the original model into buf_read. + // std::thread compute_th; // manages compute_pool (exceptions, stopping, etc.) + // std::thread writer_th; // constantly writing tensor data from buf_write to the output stream IN ORDER. // init scheduler(const int32_t _n_threads, std::vector _tsd_vec): From 8368609d0ead4011208630347809a25d830ee9c2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 22:00:22 -0500 Subject: [PATCH 20/23] WIP --- src/llama-quant.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-quant.h b/src/llama-quant.h index 0604ea63bdb..ea3bc67d914 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,5 +1,4 @@ #pragma once -#include // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. From f5c0c38ee2a1f72dd8959a41ee481a0d781d3f4b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 22:10:19 -0500 Subject: [PATCH 21/23] WIP --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d82cdc5fb67..55943370bd8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -180,7 +180,7 @@ struct quantize_state_impl { bool has_imatrix = false; // used to figure out if a model has tied embeddings (tok_embd shares weights with output) - bool has_tied_embeddings = false; // assume tied until we see output.weight + bool has_tied_embeddings = true; // assume tied until we see output.weight // tensor type override patterns (compiled once, used twice) std::vector> tensor_type_patterns; From 3db73fcb83d51197cdf5ec348703b8e9496ef728 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 10 Mar 2026 02:19:42 -0500 Subject: [PATCH 22/23] WIP --- src/llama-quant-scheduler.cpp | 9 ++------- src/llama-quant.h | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 5b44da5ea59..19c566149a6 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -256,12 +256,7 @@ struct scheduler { LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); } - void run() { - // TODO: start `reader_th` thread - // TODO: start `compute_th` thread - // TODO: start `writer_th` thread - // throw std::runtime_error if something fails - } - ~scheduler() = default; }; + +void scheduler::run() {}; \ No newline at end of file diff --git a/src/llama-quant.h b/src/llama-quant.h index ea3bc67d914..c340a7e3845 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -3,12 +3,19 @@ // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. 
struct tensor_sched_data {
-    const ggml_type src_type;    // source tensor type
-    const ggml_type dst_type;    // destination tensor type
-    const int64_t ne0;           // n_cols
-    const int64_t ne1;           // n_rows
-    const int64_t ne2;           // n_expert (or any 3rd tensor dimension)
-    const int64_t ne3;           // any 4th tensor dimension (currently unused, always 1)
-    const void * const src_data; // pointer to raw source tensor data buffer, read-only
-    const void * const imatrix;  // pointer to imatrix data, or nullptr, read-only
+    const std::vector<int64_t> ne;
+    const ggml_type src_type;
+    const ggml_type dst_type;
+    const void * src_data; // pointer to raw source tensor data buffer, read-only
+    const void * imatrix;  // pointer to imatrix data, or nullptr, read-only
+    tensor_sched_data(
+        const std::vector<int64_t> & _ne,
+        const ggml_type _src_type,
+        const ggml_type _dst_type,
+        const void * _src_data,
+        const void * _imatrix
+    ) :
+        ne(_ne), src_type(_src_type), dst_type(_dst_type),
+        src_data(_src_data), imatrix(_imatrix)
+    {}
 };

From 16bef330b9d0f1a53051263765702ce47f4302bb Mon Sep 17 00:00:00 2001
From: ddh0 
Date: Wed, 25 Mar 2026 02:59:31 -0500
Subject: [PATCH 23/23] begin to adopt command pattern

---
 src/llama-quant.h | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/llama-quant.h b/src/llama-quant.h
index c340a7e3845..d1c4cc9fdd6 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -1,21 +1,25 @@
 #pragma once

-// per-tensor info needed by the quantization work scheduler.
-// constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise.
-struct tensor_sched_data {
-    const std::vector<int64_t> ne;
-    const ggml_type src_type;
-    const ggml_type dst_type;
-    const void * src_data; // pointer to raw source tensor data buffer, read-only
-    const void * imatrix;  // pointer to imatrix data, or nullptr, read-only
-    tensor_sched_data(
-        const std::vector<int64_t> & _ne,
-        const ggml_type _src_type,
-        const ggml_type _dst_type,
-        const void * _src_data,
-        const void * _imatrix
-    ) :
-        ne(_ne), src_type(_src_type), dst_type(_dst_type),
-        src_data(_src_data), imatrix(_imatrix)
-    {}
+enum sched_cmd_status {
+    CMD_STATUS_PENDING,
+    CMD_STATUS_IN_PROGRESS,
+    CMD_STATUS_COMPLETE,
+    CMD_STATUS_COUNT, // always last
+};
+
+// types of operations performed by the quantization work scheduler
+enum sched_cmd_type {
+    CMD_TYPE_READ,
+    CMD_TYPE_DEQUANTIZE,
+    CMD_TYPE_QUANTIZE,
+    CMD_TYPE_WRITE,
+    CMD_TYPE_COUNT // always last
+};
+
+// unit of work for the quantization work scheduler (command pattern)
+struct sched_cmd {
+    const ggml_tensor * tensor;
+    const enum sched_cmd_type sched_cmd_type;
+
+    std::atomic<sched_cmd_status> sched_cmd_status;
 };
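The final commit only introduces the command vocabulary (sched_cmd plus its type and status enums); how commands get queued and executed is still open. As a rough, non-authoritative sketch of where a command-pattern scheduler typically goes, the example below has worker threads drain a shared command queue. Everything in it beyond the enums (the deque, the worker lambda, the int tensor index standing in for const ggml_tensor *) is assumed for illustration and is not part of this patch series; a real version would also have to enforce per-tensor ordering (read before quantize before write) and keep tensors in file order when writing.

    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <thread>
    #include <vector>

    // illustrative mirrors of the enums above, with shortened names to keep the sketch compact
    enum cmd_type   { CMD_READ, CMD_DEQUANTIZE, CMD_QUANTIZE, CMD_WRITE };
    enum cmd_status { PENDING, IN_PROGRESS, COMPLETE };

    struct cmd {
        int        tensor_idx;          // stands in for `const ggml_tensor * tensor`
        cmd_type   type;
        cmd_status status = PENDING;
    };

    int main() {
        std::deque<cmd> queue;
        std::mutex      mtx;

        // enqueue a read -> quantize -> write sequence for a few fake tensors
        for (int i = 0; i < 4; ++i) {
            queue.push_back({i, CMD_READ});
            queue.push_back({i, CMD_QUANTIZE});
            queue.push_back({i, CMD_WRITE});
        }

        auto worker = [&] {
            while (true) {
                cmd c{};
                {
                    std::lock_guard<std::mutex> lock(mtx);
                    if (queue.empty()) {
                        return;
                    }
                    c = queue.front();
                    queue.pop_front();
                }
                c.status = IN_PROGRESS;
                // a real worker would read / dequantize / quantize / write here
                c.status = COMPLETE;
                std::printf("tensor %d: command %d complete\n", c.tensor_idx, c.type);
            }
        };

        std::vector<std::thread> workers;
        for (int i = 0; i < 2; ++i) {
            workers.emplace_back(worker);
        }
        for (auto & t : workers) {
            t.join();
        }
        return 0;
    }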