From decff8b53afefd239d51f20a790419c66559de5c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 3 Mar 2026 18:15:49 -0600 Subject: [PATCH 01/23] quantize : imatrix-fail early + code cleanup --- src/llama-quant.cpp | 712 ++++++++++++++++++++++-------------- src/llama-quant.h | 24 ++ tools/quantize/quantize.cpp | 19 +- 3 files changed, 460 insertions(+), 295 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 24770430e1c..58ed0e9db7a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -1,11 +1,12 @@ +#include "llama.h" #include "llama-quant.h" #include "llama-impl.h" #include "llama-model.h" #include "llama-model-loader.h" -#include #include #include +#include #include #include #include @@ -13,12 +14,6 @@ #include #include -// Quantization types. Changes to this struct must be replicated in quantize.cpp -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -54,7 +49,7 @@ static std::string remap_layer(const std::string & orig_name, const std::vector< return orig_name; } -static std::string remap_imatrix (const std::string & orig_name, const std::map & mapped) { +static std::string remap_imatrix(const std::string & orig_name, const std::map & mapped) { if (mapped.empty()) { return orig_name; } @@ -76,6 +71,73 @@ static std::string remap_imatrix (const std::string & orig_name, const std::map< return orig_name; } +// +// helper functions for tensor name matching +// + +static bool tensor_name_match_token_embd(const char * tensor_name) { + return std::strcmp(tensor_name, "token_embd.weight") == 0 || + std::strcmp(tensor_name, "per_layer_token_embd.weight") == 0; +} + +static bool tensor_name_match_output_weight(const char * tensor_name) { + return std::strcmp(tensor_name, "output.weight") == 0; +} + +// +// tensor categorization for quantization +// +// (this is different from LLM_TN - we want broad categories, not specific tensor names per arch) +// + +static tensor_category tensor_get_category(const std::string & tensor_name) { + if (tensor_name_match_output_weight(tensor_name.c_str())) { + return tensor_category::OUTPUT; + } + if (tensor_name_match_token_embd(tensor_name.c_str())) { + return tensor_category::TOKEN_EMBD; + } + if (tensor_name.find("attn_qkv.weight") != std::string::npos) { + return tensor_category::ATTENTION_QKV; + } + if (tensor_name.find("attn_kv_b.weight") != std::string::npos) { + return tensor_category::ATTENTION_KV_B; + } + if (tensor_name.find("attn_v.weight") != std::string::npos) { + return tensor_category::ATTENTION_V; + } + if (tensor_name.find("attn_k.weight") != std::string::npos) { + return tensor_category::ATTENTION_K; + } + if (tensor_name.find("attn_q.weight") != std::string::npos) { + return tensor_category::ATTENTION_Q; + } + if (tensor_name.find("attn_output.weight") != std::string::npos) { + return tensor_category::ATTENTION_OUTPUT; + } + if (tensor_name.find("ffn_up") != std::string::npos) { + return tensor_category::FFN_UP; + } + if (tensor_name.find("ffn_gate") != std::string::npos) { + return tensor_category::FFN_GATE; + } + if (tensor_name.find("ffn_down") != std::string::npos) { + return tensor_category::FFN_DOWN; + } + return tensor_category::OTHER; +} + +// check if category is for attention-v-like tensors (more sensitive to quantization) +static bool category_is_attn_v(tensor_category cat) { + return cat == tensor_category::ATTENTION_V || + cat == tensor_category::ATTENTION_QKV 
|| + cat == tensor_category::ATTENTION_KV_B; +} + +// +// quantization state +// + struct quantize_state_impl { const llama_model & model; const llama_model_quantize_params * params; @@ -89,20 +151,43 @@ struct quantize_state_impl { int i_ffn_gate = 0; int i_ffn_up = 0; - int n_k_quantized = 0; int n_fallback = 0; bool has_imatrix = false; - // used to figure out if a model shares tok_embd with the output weight - bool has_output = false; + // used to figure out if a model has tied embeddings (tok_embd shares weights with output) + bool has_tied_embeddings = false; // assume tied until we see output.weight + + // tensor type override patterns (compiled once, used twice) + std::vector> tensor_type_patterns; quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params) : model(model) , params(params) - {} + { + // compile regex patterns once - they are expensive + if (params->tensor_types) { + const auto & tensor_types = *static_cast *>(params->tensor_types); + for (const auto & [tname, qtype] : tensor_types) { + tensor_type_patterns.emplace_back(std::regex(tname), qtype); + } + } + } }; +// per-tensor metadata, computed in the preliminary loop and used in the main loop +struct tensor_metadata { + ggml_type target_type; + tensor_category category; + std::string remapped_imatrix_name; + bool allows_quantization; + bool requires_imatrix; +}; + +// +// dequantization +// + static void llama_tensor_dequantize_impl( ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread @@ -175,12 +260,132 @@ static void llama_tensor_dequantize_impl( workers.clear(); } -static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) { +// +// do we allow this tensor to be quantized? +// + +static bool tensor_allows_quantization(const llama_model_quantize_params * params, llm_arch arch, const ggml_tensor * tensor) { + // trivial checks first -- no string ops needed + if (params->only_copy) return false; + + // quantize only 2D and 3D tensors (experts) + if (ggml_n_dims(tensor) < 2) return false; + + const std::string name = ggml_get_name(tensor); + + // This used to be a regex, but has an extreme cost to compile times. + bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? + + // do not quantize norm tensors + quantize &= name.find("_norm.weight") == std::string::npos; + + quantize &= params->quantize_output_tensor || name != "output.weight"; + + // do not quantize expert gating tensors + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; + + // these are very small (e.g. 
4x4) + quantize &= name.find("altup") == std::string::npos; + quantize &= name.find("laurel") == std::string::npos; + + // these are not too big so keep them as it is + quantize &= name.find("per_layer_model_proj") == std::string::npos; + + // do not quantize positional embeddings and token types (BERT) + quantize &= name != LLM_TN(arch)(LLM_TENSOR_POS_EMBD, "weight"); + quantize &= name != LLM_TN(arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + + // do not quantize Mamba/Kimi's small conv1d weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ssm_conv1d") == std::string::npos; + quantize &= name.find("shortconv.conv.weight") == std::string::npos; + + // do not quantize RWKV's small yet 2D weights + quantize &= name.find("time_mix_first.weight") == std::string::npos; + quantize &= name.find("time_mix_w0.weight") == std::string::npos; + quantize &= name.find("time_mix_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_v0.weight") == std::string::npos; + quantize &= name.find("time_mix_v1.weight") == std::string::npos; + quantize &= name.find("time_mix_v2.weight") == std::string::npos; + quantize &= name.find("time_mix_a0.weight") == std::string::npos; + quantize &= name.find("time_mix_a1.weight") == std::string::npos; + quantize &= name.find("time_mix_a2.weight") == std::string::npos; + quantize &= name.find("time_mix_g1.weight") == std::string::npos; + quantize &= name.find("time_mix_g2.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; + quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; + quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; + + // do not quantize relative position bias (T5) + quantize &= name.find("attn_rel_b.weight") == std::string::npos; + + // do not quantize specific multimodal tensors + quantize &= name.find(".position_embd.") == std::string::npos; + + return quantize; +} + +// +// tensor type selection +// + +// incompatible tensor shapes are handled here - fallback to a compatible type +static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tensor * t, const ggml_type target_type) { + ggml_type return_type = target_type; + + const int64_t ncols = t->ne[0]; + const int64_t qk_k = ggml_blck_size(target_type); + + if (ncols % qk_k != 0) { // this tensor's shape is incompatible with this quant + LLAMA_LOG_WARN("warning: %-36s - ncols %6" PRId64 " not divisible by %3" PRId64 " (required for type %7s) ", + t->name, ncols, qk_k, ggml_type_name(target_type)); + ++qs.n_fallback; + + switch (target_type) { + // types on the left: block size 256 + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: // types on the right: block size 32 + case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: return_type = GGML_TYPE_Q4_0; break; + case GGML_TYPE_Q4_K: return_type = GGML_TYPE_Q5_0; break; + case GGML_TYPE_Q5_K: return_type = GGML_TYPE_Q5_1; break; + case GGML_TYPE_Q6_K: return_type = GGML_TYPE_Q8_0; break; + default: + throw std::runtime_error(format("no tensor type fallback is defined for type %s", + ggml_type_name(target_type))); + } + if (ncols % ggml_blck_size(return_type) != 0) { + // + // the fallback return type 
is still not compatible for this tensor! + // + // most likely, this tensor's first dimension is not divisible by 32. + // this is very rare. we can either abort the quantization, or + // fallback to F16 / F32. + // + LLAMA_LOG_WARN("(WARNING: must use F16 due to unusual shape) "); + return_type = GGML_TYPE_F16; + } + LLAMA_LOG_WARN("-> falling back to %7s\n", ggml_type_name(return_type)); + } + return return_type; +} + +// internal standard logic for selecting the target tensor type based on tensor category, ftype, and model arch +static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype, tensor_category category) { const std::string name = ggml_get_name(tensor); // TODO: avoid hardcoded tensor names - use the TN_* constants const llm_arch arch = qs.model.arch; - const auto tn = LLM_TN(arch); auto use_more_bits = [](int i_layer, int n_layers) -> bool { return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 || (i_layer - n_layers/8)%3 == 2; @@ -204,7 +409,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor - if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { + if (category == tensor_category::OUTPUT || (qs.has_tied_embeddings && category == tensor_category::TOKEN_EMBD)) { if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { new_type = qs.params->output_tensor_type; } else { @@ -234,7 +439,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } else { new_type = GGML_TYPE_Q8_0; } - } else if (name == "token_embd.weight" || name == "per_layer_token_embd.weight") { + } else if (category == tensor_category::TOKEN_EMBD) { if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { new_type = qs.params->token_embedding_type; } else { @@ -254,21 +459,21 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { - if (name.find("attn_v.weight") != std::string::npos) { + if (category_is_attn_v(category)) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; ++qs.i_attention_wv; } - else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) { + else if (qs.model.hparams.n_expert == 8 && category == tensor_category::ATTENTION_K) { new_type = GGML_TYPE_Q4_K; } - else if (name.find("ffn_down") != std::string::npos) { + else if (category == tensor_category::FFN_DOWN) { if (qs.i_ffn_down < qs.n_ffn_down/8) { new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? 
GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; } ++qs.i_ffn_down; } - else if (name.find("attn_output.weight") != std::string::npos) { + else if (category == tensor_category::ATTENTION_OUTPUT) { if (qs.model.hparams.n_expert == 8) { new_type = GGML_TYPE_Q5_K; } else { @@ -276,7 +481,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; } } - } else if (name.find("attn_v.weight") != std::string::npos) { + } else if (category_is_attn_v(category)) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -314,7 +519,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = GGML_TYPE_Q8_0; } ++qs.i_attention_wv; - } else if (name.find("attn_k.weight") != std::string::npos) { + } else if (category == tensor_category::ATTENTION_K) { if (qs.model.hparams.n_expert == 8) { // for the 8-expert model, bumping this to Q8_0 trades just ~128MB // TODO: explore better strategies @@ -326,14 +531,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } - } else if (name.find("attn_q.weight") != std::string::npos) { + } else if (category == tensor_category::ATTENTION_Q) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_XXS; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } - } else if (name.find("ffn_down") != std::string::npos) { + } else if (category == tensor_category::FFN_DOWN) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; @@ -378,7 +583,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? 
GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1; } ++qs.i_ffn_down; - } else if (name.find("attn_output.weight") != std::string::npos) { + } else if (category == tensor_category::ATTENTION_OUTPUT) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || @@ -398,14 +603,14 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } } - else if (name.find("attn_qkv.weight") != std::string::npos) { + else if (category == tensor_category::ATTENTION_QKV) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } - else if (name.find("ffn_gate") != std::string::npos) { + else if (category == tensor_category::FFN_GATE) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { @@ -413,7 +618,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t } ++qs.i_ffn_gate; } - else if (name.find("ffn_up") != std::string::npos) { + else if (category == tensor_category::FFN_UP) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { @@ -425,6 +630,54 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t return new_type; } +// outer wrapper: determine the ggml_type that this tensor should be quantized to +static ggml_type llama_tensor_get_type(quantize_state_impl & qs, const llama_model_quantize_params * params, const ggml_tensor * tensor, ggml_type default_type, const tensor_metadata & tm) { + if (!tensor_allows_quantization(params, qs.model.arch, tensor)) { + return tensor->type; + } + if (params->token_embedding_type < GGML_TYPE_COUNT && tm.category == tensor_category::TOKEN_EMBD) { + return params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && tm.category == tensor_category::OUTPUT) { + return params->output_tensor_type; + } + + ggml_type new_type = default_type; + + // get more optimal quantization type based on the tensor shape, layer, etc. 
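+    // for example (illustrative only): a pattern supplied through the --tensor-type option,
+    // e.g. something like "ffn_down=q6_k", ends up in qs.tensor_type_patterns and takes
+    // precedence over the ftype-based heuristics applied below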
+ if (!params->pure && ggml_is_quantized(default_type)) { + // if the user provided tensor types - use those + bool manual = false; + if (!qs.tensor_type_patterns.empty()) { + const std::string tensor_name(tensor->name); + for (const auto & [pattern, qtype] : qs.tensor_type_patterns) { + if (std::regex_search(tensor_name, pattern)) { + if (qtype != new_type) { + LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); + new_type = qtype; + manual = true; + break; + } + } + } + } + + // if not manual - use the standard logic for choosing the quantization type based on the selected mixture + if (!manual) { + new_type = llama_tensor_get_type_impl(qs, new_type, tensor, params->ftype, tm.category); + } + + // incompatible tensor shapes are handled here - fallback to a compatible type + new_type = tensor_type_fallback(qs, tensor, new_type); + } + + return new_type; +} + +// +// quantization implementation +// + static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread @@ -479,61 +732,85 @@ static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * return new_size; } -static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) { - return ( - dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS || - dst_type == GGML_TYPE_IQ3_XXS || dst_type == GGML_TYPE_IQ1_S || - dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_M || - ( // Q2_K_S is the worst k-quant type - only allow it without imatrix for token embeddings - dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0 - ) - ); +// +// imatrix requirement check +// + +static bool tensor_requires_imatrix(const char * tensor_name, const ggml_type dst_type, const llama_ftype ftype) { + if (tensor_name_match_token_embd(tensor_name) || tensor_name_match_output_weight(tensor_name)) { + return false; + } + switch (dst_type) { + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ1_S: + return true; + case GGML_TYPE_Q2_K: + // as a general rule, the k-type quantizations don't require imatrix data. + // the only exception is Q2_K tensors that are part of a Q2_K_S file. 
+ return ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S; + default: + return false; + } } -static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { - ggml_type default_type; - llama_ftype ftype = params->ftype; +// +// given a file type, get the default tensor type +// - switch (params->ftype) { - case LLAMA_FTYPE_MOSTLY_Q4_0: default_type = GGML_TYPE_Q4_0; break; - case LLAMA_FTYPE_MOSTLY_Q4_1: default_type = GGML_TYPE_Q4_1; break; - case LLAMA_FTYPE_MOSTLY_Q5_0: default_type = GGML_TYPE_Q5_0; break; - case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; - case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; - case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; - case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break; - case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; +static ggml_type llama_ftype_get_default_type(llama_ftype ftype) { + switch (ftype) { + case LLAMA_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0; + case LLAMA_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1; + case LLAMA_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0; + case LLAMA_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1; + case LLAMA_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0; + case LLAMA_FTYPE_MOSTLY_F16: return GGML_TYPE_F16; + case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16; + case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32; - case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: default_type = GGML_TYPE_MXFP4; break; + case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4; // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: - case LLAMA_FTYPE_MOSTLY_Q2_K: default_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XS: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q2_K: return GGML_TYPE_Q2_K; + case LLAMA_FTYPE_MOSTLY_IQ3_XS: return GGML_TYPE_IQ3_S; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: - case LLAMA_FTYPE_MOSTLY_Q3_K_L: default_type = GGML_TYPE_Q3_K; break; + case LLAMA_FTYPE_MOSTLY_Q3_K_L: return GGML_TYPE_Q3_K; case LLAMA_FTYPE_MOSTLY_Q4_K_S: - case LLAMA_FTYPE_MOSTLY_Q4_K_M: default_type = GGML_TYPE_Q4_K; break; + case LLAMA_FTYPE_MOSTLY_Q4_K_M: return GGML_TYPE_Q4_K; case LLAMA_FTYPE_MOSTLY_Q5_K_S: - case LLAMA_FTYPE_MOSTLY_Q5_K_M: default_type = GGML_TYPE_Q5_K; break; - case LLAMA_FTYPE_MOSTLY_Q6_K: default_type = GGML_TYPE_Q6_K; break; - case LLAMA_FTYPE_MOSTLY_TQ1_0: default_type = GGML_TYPE_TQ1_0; break; - case LLAMA_FTYPE_MOSTLY_TQ2_0: default_type = GGML_TYPE_TQ2_0; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XXS: default_type = GGML_TYPE_IQ2_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_XS: default_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_S: default_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; - case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; - case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; - case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; - case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_Q5_K_M: return GGML_TYPE_Q5_K; + case LLAMA_FTYPE_MOSTLY_Q6_K: return GGML_TYPE_Q6_K; + case LLAMA_FTYPE_MOSTLY_TQ1_0: return GGML_TYPE_TQ1_0; + case 
LLAMA_FTYPE_MOSTLY_TQ2_0: return GGML_TYPE_TQ2_0; + case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return GGML_TYPE_IQ2_XXS; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return GGML_TYPE_IQ2_XS; + case LLAMA_FTYPE_MOSTLY_IQ2_S: return GGML_TYPE_IQ2_XS; + case LLAMA_FTYPE_MOSTLY_IQ2_M: return GGML_TYPE_IQ2_S; + case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return GGML_TYPE_IQ3_XXS; + case LLAMA_FTYPE_MOSTLY_IQ1_S: return GGML_TYPE_IQ1_S; + case LLAMA_FTYPE_MOSTLY_IQ1_M: return GGML_TYPE_IQ1_M; + case LLAMA_FTYPE_MOSTLY_IQ4_NL: return GGML_TYPE_IQ4_NL; + case LLAMA_FTYPE_MOSTLY_IQ4_XS: return GGML_TYPE_IQ4_XS; + case LLAMA_FTYPE_MOSTLY_IQ3_S: + case LLAMA_FTYPE_MOSTLY_IQ3_M: return GGML_TYPE_IQ3_S; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } +} + +// +// main quantization driver +// + +static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) { + ggml_type default_type; + llama_ftype ftype = params->ftype; int nthread = params->nthread; @@ -541,6 +818,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: nthread = std::thread::hardware_concurrency(); } + default_type = llama_ftype_get_default_type(ftype); + // mmap consistently increases speed on Linux, and also increases speed on Windows with // hot cache. It may cause a slowdown on macOS, possibly related to free memory. #if defined(__linux__) || defined(_WIN32) @@ -567,6 +846,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: quantize_state_impl qs(model, params); + // these need to be set to n_layer by default + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; + if (params->only_copy) { ftype = ml.ftype; } @@ -574,7 +856,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (params->imatrix) { imatrix_data = static_cast>*>(params->imatrix); if (imatrix_data) { - LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); + LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n", + __func__, (int)imatrix_data->size()); qs.has_imatrix = true; // check imatrix for nans or infs for (const auto & kv : *imatrix_data) { @@ -657,35 +940,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: }); } - for (const auto * it : tensors) { - const struct ggml_tensor * tensor = it->tensor; - - const std::string name = ggml_get_name(tensor); - - // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos || - name.find("attn_kv_b.weight")!= std::string::npos) { - ++qs.n_attention_wv; - } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { - qs.has_output = true; - } - } - - qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; - - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector workers; - workers.reserve(nthread); - int idx = 0; - - std::vector> read_data; - std::vector> work; - std::vector> f32_conv_buf; - uint16_t n_split = 1; // Assume split index is continuous @@ -697,14 +952,62 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: std::vector ctx_outs(n_split); ctx_outs[0] = std::move(ctx_out); - // populate the original tensors so we get an initial meta data - for (const auto * it : tensors) { + // compute tensor metadata once and cache it + 
std::vector metadata(tensors.size()); + + // flag for --dry-run + bool will_require_imatrix = false; + + // + // preliminary iteration over all weights + // + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto * it = tensors[i]; + const struct ggml_tensor * tensor = it->tensor; + const std::string name = ggml_get_name(tensor); + + metadata[i].category = tensor_get_category(name); + + if (category_is_attn_v(metadata[i].category)) { + ++qs.n_attention_wv; + } + + if (tensor_name_match_output_weight(name.c_str())) { + qs.has_tied_embeddings = false; + } + uint16_t i_split = params->keep_split ? it->idx : 0; - ggml_tensor * tensor = it->tensor; if (!ctx_outs[i_split]) { ctx_outs[i_split].reset(gguf_init_empty()); } gguf_add_tensor(ctx_outs[i_split].get(), tensor); + + metadata[i].allows_quantization = tensor_allows_quantization(params, model.arch, tensor); + + if (metadata[i].allows_quantization) { + metadata[i].target_type = llama_tensor_get_type(qs, params, tensor, default_type, metadata[i]); + } else { + metadata[i].target_type = tensor->type; + } + + metadata[i].requires_imatrix = tensor_requires_imatrix(tensor->name, metadata[i].target_type, ftype); + + if (params->imatrix) { + metadata[i].remapped_imatrix_name = remap_imatrix(tensor->name, mapped); + } else if (metadata[i].allows_quantization && metadata[i].requires_imatrix) { + if (params->dry_run) { + will_require_imatrix = true; + } else { + LLAMA_LOG_ERROR("\n============================================================================\n" + " ERROR: this quantization requires an importance matrix!\n" + " - offending tensor: %s\n" + " - target type: %s\n" + "============================================================================\n\n", + name.c_str(), ggml_type_name(metadata[i].target_type)); + throw std::runtime_error("this quantization requires an imatrix!"); + } + } } // Set split info if needed @@ -716,6 +1019,16 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector workers; + workers.reserve(nthread); + + std::vector> read_data; + std::vector> work; + std::vector> f32_conv_buf; + int cur_split = -1; std::ofstream fout; auto close_ofstream = [&]() { @@ -745,20 +1058,20 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ::zeros(fout, meta_size); }; - const auto tn = LLM_TN(model.arch); - // no output file for --dry-run if (!params->dry_run) { new_ofstream(0); } - // flag for `--dry-run`, to let the user know if imatrix will be required for a real - // quantization, as a courtesy - bool will_require_imatrix = false; + // + // main loop: iterate over all weights + // - for (const auto * it : tensors) { - const auto & weight = *it; + for (size_t i = 0; i < tensors.size(); ++i) { + const auto & weight = *tensors[i]; + const auto & tm = metadata[i]; ggml_tensor * tensor = weight.tensor; + if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) { close_ofstream(); new_ofstream(weight.idx); @@ -783,156 +1096,25 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: llama_format_tensor_shape(tensor).c_str(), ggml_type_name(tensor->type)); - // This used to be a regex, but has an extreme cost to compile times. - bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? 
- - // quantize only 2D and 3D tensors (experts) - quantize &= (ggml_n_dims(tensor) >= 2); - - // do not quantize norm tensors - quantize &= name.find("_norm.weight") == std::string::npos; - - quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= !params->only_copy; - - // do not quantize expert gating tensors - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ffn_gate_inp.weight") == std::string::npos; - - // these are very small (e.g. 4x4) - quantize &= name.find("altup") == std::string::npos; - quantize &= name.find("laurel") == std::string::npos; - - // these are not too big so keep them as it is - quantize &= name.find("per_layer_model_proj") == std::string::npos; - - // do not quantize positional embeddings and token types (BERT) - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); - quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); - - // do not quantize Mamba /Kimi's small conv1d weights - // NOTE: can't use LLM_TN here because the layer number is not known - quantize &= name.find("ssm_conv1d") == std::string::npos; - quantize &= name.find("shortconv.conv.weight") == std::string::npos; - - // do not quantize RWKV's small yet 2D weights - quantize &= name.find("time_mix_first.weight") == std::string::npos; - quantize &= name.find("time_mix_w0.weight") == std::string::npos; - quantize &= name.find("time_mix_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_v0.weight") == std::string::npos; - quantize &= name.find("time_mix_v1.weight") == std::string::npos; - quantize &= name.find("time_mix_v2.weight") == std::string::npos; - quantize &= name.find("time_mix_a0.weight") == std::string::npos; - quantize &= name.find("time_mix_a1.weight") == std::string::npos; - quantize &= name.find("time_mix_a2.weight") == std::string::npos; - quantize &= name.find("time_mix_g1.weight") == std::string::npos; - quantize &= name.find("time_mix_g2.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos; - quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos; - quantize &= name.find("time_mix_lerp_fused.weight") == std::string::npos; - - // do not quantize relative position bias (T5) - quantize &= name.find("attn_rel_b.weight") == std::string::npos; - - // do not quantize specific multimodal tensors - quantize &= name.find(".position_embd.") == std::string::npos; - - ggml_type new_type; - void * new_data; - size_t new_size; + const ggml_type cur_type = tensor->type; + const ggml_type new_type = tm.target_type; - if (quantize) { - new_type = default_type; - - // get more optimal quantization type based on the tensor shape, layer, etc. 
- if (!params->pure && ggml_is_quantized(default_type)) { - // if the user provided tensor types - use those - bool manual = false; - if (params->tensor_types) { - const std::vector & tensor_types = *static_cast *>(params->tensor_types); - const std::string tensor_name(tensor->name); - for (const auto & [tname, qtype] : tensor_types) { - if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) { - if (qtype != new_type) { - LLAMA_LOG_WARN("(manual override: %s -> %s) ", ggml_type_name(new_type), ggml_type_name(qtype)); - new_type = qtype; // if two or more types are specified for the same tensor, the last match wins - manual = true; - break; - } - } - } - } + // If we've decided to quantize to the same type the tensor is already + // in then there's nothing to do. + bool quantize = cur_type != new_type; - // if not manual - use the standard logic for choosing the quantization type based on the selected mixture - if (!manual) { - new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - } - - // incompatible tensor shapes are handled here - fallback to a compatible type - { - bool convert_incompatible_tensor = false; - - const int64_t nx = tensor->ne[0]; - const int64_t ny = tensor->ne[1]; - const int64_t qk_k = ggml_blck_size(new_type); - - if (nx % qk_k != 0) { - LLAMA_LOG_WARN("\n\n%s : tensor cols %" PRId64 " x %" PRId64 " are not divisible by %" PRId64 ", required for %s", __func__, nx, ny, qk_k, ggml_type_name(new_type)); - convert_incompatible_tensor = true; - } else { - ++qs.n_k_quantized; - } - - if (convert_incompatible_tensor) { - switch (new_type) { - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: new_type = GGML_TYPE_Q4_0; break; // TODO: use a symmetric type instead - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; - case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; - case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break; - case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break; - default: throw std::runtime_error("\nUnsupported tensor size encountered\n"); - } - if (tensor->ne[0] % ggml_blck_size(new_type) != 0) { - new_type = GGML_TYPE_F16; - } - LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type)); - ++qs.n_fallback; - } - } - } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { - new_type = params->token_embedding_type; - } - if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { - new_type = params->output_tensor_type; - } - - // If we've decided to quantize to the same type the tensor is already - // in then there's nothing to do. 
- quantize = tensor->type != new_type; - } + void * new_data; + size_t new_size; - // we have now decided on the target type for this tensor if (params->dry_run) { - // the --dry-run option calculates the final quantization size without quantizting + // the --dry-run option calculates the final quantization size without quantizing if (quantize) { new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]); LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0, ggml_type_name(new_type)); - if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) { + if (!will_require_imatrix && tm.requires_imatrix) { will_require_imatrix = true; } } else { @@ -945,7 +1127,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } else { // no --dry-run, perform quantization if (!quantize) { - new_type = tensor->type; new_data = tensor->data; new_size = tensor_size; LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0); @@ -954,7 +1135,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const float * imatrix = nullptr; if (imatrix_data) { - auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped)); + auto it = imatrix_data->find(tm.remapped_imatrix_name); if (it == imatrix_data->end()) { LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { @@ -968,14 +1149,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // this is a significant error and it may be good idea to abort the process if this happens, // since many people will miss the error and not realize that most of the model is being quantized without an imatrix // tok_embd should be ignored in this case, since it always causes this warning - if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + if (!tensor_name_match_token_embd(tensor->name)) { throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); } } } } - if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) { + if (!imatrix && tm.requires_imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n"); @@ -1020,29 +1201,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: const float * imatrix_03 = imatrix ? 
imatrix + i03 * n_per_row : nullptr; new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); - - // TODO: temporary sanity check that the F16 -> MXFP4 is lossless -#if 0 - if (new_type == GGML_TYPE_MXFP4) { - auto * x = f32_data_03; - - //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row); - std::vector deq(nrows*n_per_row); - const ggml_type_traits * qtype = ggml_get_type_traits(new_type); - qtype->to_float(new_data_03, deq.data(), deq.size()); - - double err = 0.0f; - for (int i = 0; i < (int) deq.size(); ++i) { - err += fabsf(deq[i] - x[i]); - //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) { - if (deq[i] != x[i]) { - LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]); - } - } - //LLAMA_LOG_INFO("err = %f\n", err); - GGML_ASSERT(err == 0.00000); - } -#endif } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0); } @@ -1058,7 +1216,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } // no --dry-run - } // iterate over tensors + } // main loop if (!params->dry_run) { close_ofstream(); @@ -1075,7 +1233,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (qs.n_fallback > 0) { LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n", - __func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback); + __func__, qs.n_fallback, ml.n_tensors); } } diff --git a/src/llama-quant.h b/src/llama-quant.h index 6f70f09beec..a91ebffa37e 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1 +1,25 @@ #pragma once +#include + +// Quantization types, used in both quantize.cpp and llama-quant.cpp +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + +// tensor categorization - used to avoid repeated string matching in quantization logic. +// this is different from LLM_TN - we want broad categories, not specific tensor names per arch. +enum class tensor_category { + TOKEN_EMBD, + ATTENTION_Q, + ATTENTION_V, + ATTENTION_K, + ATTENTION_QKV, + ATTENTION_KV_B, + ATTENTION_OUTPUT, + FFN_UP, + FFN_GATE, + FFN_DOWN, + OUTPUT, + OTHER +}; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 59bf9bd3fd0..9a7e2fc1c7b 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -1,6 +1,7 @@ #include "common.h" #include "llama.h" #include "gguf.h" +#include "../src/llama-quant.h" #include #include @@ -61,12 +62,6 @@ static const std::vector QUANT_OPTIONS = { { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; -// Quantization types. 
Changes to this struct must be replicated in llama-quantize.cpp -struct tensor_quantization { - std::string name; - ggml_type quant = GGML_TYPE_COUNT; -}; - static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; @@ -686,18 +681,6 @@ int main(int argc, char ** argv) { } } - if (!params.dry_run && - ( - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || - params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M - ) && imatrix_data.empty()) { - fprintf(stderr, "\n==========================================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "==========================================================================================================\n\n\n"); - return 1; - } - if (!params.dry_run) { if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) { fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str()); From d84833f69a57033736072c196b83913025d7af8a Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 4 Mar 2026 23:25:12 -0600 Subject: [PATCH 02/23] WIP --- src/llama-quant-scheduler.cpp | 116 ++++++++++++++++++++++++++++++++++ src/llama-quant.cpp | 6 +- src/llama-quant.h | 21 ++++-- tools/quantize/quantize.cpp | 16 ++--- 4 files changed, 144 insertions(+), 15 deletions(-) create mode 100644 src/llama-quant-scheduler.cpp diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp new file mode 100644 index 00000000000..a0cf5116567 --- /dev/null +++ b/src/llama-quant-scheduler.cpp @@ -0,0 +1,116 @@ +/* llama-quant-scheduler.cpp -- C++17 + +ASPIRATIONS +----------- + +Whenever possible, we must overlap computation and disk I/O. In fact, disk I/O is the main +bottleneck in very many cases, and currently on `master` it's not handled very well - computation +never overlaps with I/O. There is a great opportunity to improve it! + +At the time of writing (2026-03-02), the code on `master` is kept simple (if a bit messy) and it +simply does... + + load src data -> (convert to f32) -> quantize to target type -> write tensor data + +...in a for loop over all tensors. I believe we may be able to acheive a speedup of ~4x in _some_ +cases by managing the work to be done more effectively. There are many people quantizing many models +every day with untold billions of parameters - we don't want to leave any performance on the table. + +The quantized tensors MUST end up in order in the output GGUF. +*/ + +// #include "ggml-quants.h" +#include "llama.h" +#include "llama-impl.h" +#include "llama-model.h" +#include "llama-quant.h" + +#include +#include +#include +#include +#include +#include + +// pool of compute worker threads +struct compute_pool { + const int32_t n_threads; + std::vector threads; + std::atomic_flag busy; + + compute_pool(const int32_t _n_threads): + n_threads(_n_threads), threads(_n_threads) { + // TODO: prepare the threads? but don't start them. + // TODO: init `busy` atomic flag? 
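+        // a possible sketch for the TODOs above (assumption, not final): give `busy` a known
+        // initial state here and defer thread creation to start(), where the workers can block
+        // on a condition variable until work is distributed
+        busy.clear();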
+ }; + + void start() { + // TODO: start the threads (but wait for work, don't spin) + }; + + void stop() { + // TODO: forcibly stop the thread pool (called should check `busy` before doing this) + }; +}; + +// +// quantization work scheduler +// +// goal: overlap I/O and computation, keep all threads busy (as much as reasonably possible) +// +// the scheduler actually manages (`n_threads` + 2) threads: +// - 1 thread for the `read_worker` +// - `n_thread` threads for the `thread_pool` (tensor math is divided among compute workers) +// - 1 thread for the `write_worker` +// +struct scheduler { + const int32_t n_threads; + + // metadata for all tensors in the model + std::vector tschd_vec; + + // + // scheduling pipeline buffers (one of each at most) + // + + // don't need this if using mmap + std::vector buf_read; // size = largest tensor (as found) (`largest_tensor_size`) + + // dequantization compute buffer + std::vector buf_dequant; // size = largest tensor (as f32) (`largest_tensor_size_dequant`) + + // quantization compute buffer + std::vector buf_quant; // size = largest tensor (quantized) (`largest_tensor_size_quant`) + + // hold tensor data (NOTE: tensors must be in order in the output file) + std::vector buf_write; // size = largest tensor (quantized) + + size_t largest_tensor_size = 0; + size_t largest_tensor_size_dequant = 0; + size_t largest_tensor_size_quant = 0; + + compute_pool pool; + + // initialize + scheduler(const int32_t _n_threads, std::vector _tschd_vec): + n_threads(_n_threads), tschd_vec(_tschd_vec), pool(_n_threads) + { + for (int32_t idx; idx < tschd_vec.size(); idx++) { + /* + TODO: set these: + largest_tensor_size = ...; + largest_tensor_size_dequant = ...; + largest_tensor_size_quant = ...; + */ + } + + // TODO: reserve pipeline buffers + }; + + void run() { + // TODO: start `read_worker` thread + // TODO: THIS thread should manage the compute pool + // TODO: start `write_worker` thread + // return void when done, throw std::runtime_error if something fails + } +}; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 58ed0e9db7a..23b7585adae 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -167,9 +167,9 @@ struct quantize_state_impl { { // compile regex patterns once - they are expensive if (params->tensor_types) { - const auto & tensor_types = *static_cast *>(params->tensor_types); - for (const auto & [tname, qtype] : tensor_types) { - tensor_type_patterns.emplace_back(std::regex(tname), qtype); + const auto & tensor_types = *static_cast *>(params->tensor_types); + for (const auto & [name, type] : tensor_types) { + tensor_type_patterns.emplace_back(std::regex(name), type); } } } diff --git a/src/llama-quant.h b/src/llama-quant.h index a91ebffa37e..053214a8aa5 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,10 +1,12 @@ #pragma once -#include +// #include +// #include +// #include "ggml.h" -// Quantization types, used in both quantize.cpp and llama-quant.cpp -struct tensor_quantization { +// store result of parsing --tensor-type option +struct tensor_type_option { std::string name; - ggml_type quant = GGML_TYPE_COUNT; + ggml_type type = GGML_TYPE_COUNT; }; // tensor categorization - used to avoid repeated string matching in quantization logic. @@ -23,3 +25,14 @@ enum class tensor_category { OUTPUT, OTHER }; + +// per-tensor info needed by the quantization work scheduler for efficient quantization. +// constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. 
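+// (illustrative) e.g. a 4096 x 4096 F16 tensor being quantized to Q4_K would be described as
+//   tensor_sched_data { /*ne0*/ 4096, /*ne1*/ 4096, /*ne2*/ 1, /*ne3*/ 1, GGML_TYPE_F16, GGML_TYPE_Q4_K }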
+struct tensor_sched_data { + const int64_t ne0; // ncols + const int64_t ne1; // nrows + const int64_t ne2; // n_expert (or any other 3rd dimension) + const int64_t ne3; // 4D (currently unused) + const ggml_type src_type; + const ggml_type dst_type; +}; diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 9a7e2fc1c7b..86c93d72978 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -404,7 +404,7 @@ static ggml_type parse_ggml_type(const char * arg) { return GGML_TYPE_COUNT; } -static bool parse_tensor_type(const char * data, std::vector & tensor_type) { +static bool parse_tensor_type(const char * data, std::vector & tensor_type) { const char * sep = strchr(data, '='); if (sep == nullptr) { printf("\n%s: malformed tensor type '%s'\n\n", __func__, data); @@ -424,11 +424,11 @@ static bool parse_tensor_type(const char * data, std::vector & tensor_type) { +static bool parse_tensor_type_file(const char * filename, std::vector & tensor_type) { std::ifstream file(filename); if (!file) { printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno)); @@ -490,7 +490,7 @@ int main(int argc, char ** argv) { std::string imatrix_file; std::vector included_weights, excluded_weights; std::vector kv_overrides; - std::vector tensor_types; + std::vector tensor_types; std::vector prune_layers; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { From 33b08083ef634c890d09ebf7a8bd4cf12d7abb7e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 4 Mar 2026 23:27:41 -0600 Subject: [PATCH 03/23] remove comment --- src/llama-quant.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-quant.h b/src/llama-quant.h index 053214a8aa5..6dd116a3e55 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,7 +1,5 @@ #pragma once -// #include -// #include -// #include "ggml.h" +#include // store result of parsing --tensor-type option struct tensor_type_option { From 7ff8ec6d87f4efb0a2b89745be1a0ce6ca9ddc1d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 5 Mar 2026 00:47:42 -0600 Subject: [PATCH 04/23] WIP --- src/llama-quant-scheduler.cpp | 94 ++++++++++++++++++----------------- src/llama-quant.h | 17 ++++--- 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index a0cf5116567..9d4be9df38c 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -1,23 +1,19 @@ -/* llama-quant-scheduler.cpp -- C++17 - -ASPIRATIONS ------------ - -Whenever possible, we must overlap computation and disk I/O. In fact, disk I/O is the main -bottleneck in very many cases, and currently on `master` it's not handled very well - computation -never overlaps with I/O. There is a great opportunity to improve it! - -At the time of writing (2026-03-02), the code on `master` is kept simple (if a bit messy) and it -simply does... - - load src data -> (convert to f32) -> quantize to target type -> write tensor data - -...in a for loop over all tensors. I believe we may be able to acheive a speedup of ~4x in _some_ -cases by managing the work to be done more effectively. There are many people quantizing many models -every day with untold billions of parameters - we don't want to leave any performance on the table. - -The quantized tensors MUST end up in order in the output GGUF. -*/ +/** + * + * Whenever possible, we aim to overlap computation and tensor data disk I/O. 
+ * + * This is the primary bottleneck in very many cases, and currently it's not handled very + * efficiently - computation never overlaps with I/O on `master` at the time of writing. Rather, + * the code basically does: + * + * load src tensor data -> dequantize and/or quantize -> write tensor data + * + * ...in a loop over all tensors. There is a great opportunity to improve it! I believe we may be + * able to acheive a speedup of ~4x in _some_ cases by overlapping the work to be done. There are + * many users quantizing many models with many billions of parameters - we don't want to leave any + * performance on the table. + * +**/ // #include "ggml-quants.h" #include "llama.h" @@ -44,50 +40,46 @@ struct compute_pool { // TODO: init `busy` atomic flag? }; - void start() { - // TODO: start the threads (but wait for work, don't spin) - }; - - void stop() { - // TODO: forcibly stop the thread pool (called should check `busy` before doing this) + bool distribute(tensor_sched_data & data) { + // TODO: distribute }; }; // // quantization work scheduler // -// goal: overlap I/O and computation, keep all threads busy (as much as reasonably possible) +// goal: overlap I/O and computation as often as possible to speed-up the quantization process. // -// the scheduler actually manages (`n_threads` + 2) threads: +// the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` -// - `n_thread` threads for the `thread_pool` (tensor math is divided among compute workers) +// - `n_threads` threads for the `compute_pool` (tensor math is divided among compute workers) // - 1 thread for the `write_worker` // struct scheduler { const int32_t n_threads; - // metadata for all tensors in the model + // per-tensor metadata for all tensors in the model std::vector tschd_vec; + size_t largest_tensor_size_src = 0; // size of largest tensor to be quantized (as src type) + size_t largest_tensor_size_f32 = 0; // size of largest tensor to be quantized (as f32) + size_t largest_tensor_size_dst = 0; // size of largest tensor to be quantized (as dst type) + // // scheduling pipeline buffers (one of each at most) // - // don't need this if using mmap - std::vector buf_read; // size = largest tensor (as found) (`largest_tensor_size`) - - // dequantization compute buffer - std::vector buf_dequant; // size = largest tensor (as f32) (`largest_tensor_size_dequant`) + // size: largest_tensor_size_src + std::vector buf_read; // don't need this if using mmap? - // quantization compute buffer - std::vector buf_quant; // size = largest tensor (quantized) (`largest_tensor_size_quant`) + // size: largest_tensor_size_f32 + std::vector buf_dequant; // dequantization buffer - // hold tensor data (NOTE: tensors must be in order in the output file) - std::vector buf_write; // size = largest tensor (quantized) + // size: largest_tensor_size_dst + std::vector buf_quant; // quantization buffer (do we really need this?) 
- size_t largest_tensor_size = 0; - size_t largest_tensor_size_dequant = 0; - size_t largest_tensor_size_quant = 0; + // size = largest tensor (as dst type) + std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; @@ -95,7 +87,7 @@ struct scheduler { scheduler(const int32_t _n_threads, std::vector _tschd_vec): n_threads(_n_threads), tschd_vec(_tschd_vec), pool(_n_threads) { - for (int32_t idx; idx < tschd_vec.size(); idx++) { + for (int32_t idx = 0; idx < tschd_vec.size(); idx++) { /* TODO: set these: largest_tensor_size = ...; @@ -104,13 +96,25 @@ struct scheduler { */ } - // TODO: reserve pipeline buffers + // TODO: allocate pipeline buffers }; - void run() { + ~scheduler() { + stop(); + } + + void start() { // TODO: start `read_worker` thread // TODO: THIS thread should manage the compute pool // TODO: start `write_worker` thread // return void when done, throw std::runtime_error if something fails } + + void stop() { + // TODO: graceful shutdown + deallocation of buffers + } + + void submit_compute(tensor_sched_data & tschd) { + // TODO: + } }; diff --git a/src/llama-quant.h b/src/llama-quant.h index 6dd116a3e55..8f58ec35c93 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,7 +1,7 @@ #pragma once #include -// store result of parsing --tensor-type option +// result of parsing --tensor-type option struct tensor_type_option { std::string name; ggml_type type = GGML_TYPE_COUNT; @@ -24,13 +24,14 @@ enum class tensor_category { OTHER }; -// per-tensor info needed by the quantization work scheduler for efficient quantization. +// per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. struct tensor_sched_data { - const int64_t ne0; // ncols - const int64_t ne1; // nrows - const int64_t ne2; // n_expert (or any other 3rd dimension) - const int64_t ne3; // 4D (currently unused) - const ggml_type src_type; - const ggml_type dst_type; + const void * const src_data; // pointer to raw source tensor data, read-only + const ggml_type src_type; + const ggml_type dst_type; + const int64_t ne0; // ncols + const int64_t ne1; // nrows + const int64_t ne2; // n_expert (or any 3rd tensor dimension) + const int64_t ne3; // any 4th tensor dimension (currently unused, always 1) }; From 77b5a67a92654d6f0587f3447f189c3ee7c13ebb Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 5 Mar 2026 01:46:12 -0600 Subject: [PATCH 05/23] WIP --- src/llama-quant-scheduler.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 9d4be9df38c..ea228dd4c81 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -1,6 +1,7 @@ /** * - * Whenever possible, we aim to overlap computation and tensor data disk I/O. + * Whenever possible, we aim to overlap computation and tensor data disk I/O in the quantization + * process. * * This is the primary bottleneck in very many cases, and currently it's not handled very * efficiently - computation never overlaps with I/O on `master` at the time of writing. Rather, @@ -35,7 +36,8 @@ struct compute_pool { std::atomic_flag busy; compute_pool(const int32_t _n_threads): - n_threads(_n_threads), threads(_n_threads) { + n_threads(_n_threads), threads(_n_threads) + { // TODO: prepare the threads? but don't start them. // TODO: init `busy` atomic flag? 
}; @@ -59,7 +61,7 @@ struct scheduler { const int32_t n_threads; // per-tensor metadata for all tensors in the model - std::vector tschd_vec; + std::vector data_vec; size_t largest_tensor_size_src = 0; // size of largest tensor to be quantized (as src type) size_t largest_tensor_size_f32 = 0; // size of largest tensor to be quantized (as f32) @@ -84,10 +86,10 @@ struct scheduler { compute_pool pool; // initialize - scheduler(const int32_t _n_threads, std::vector _tschd_vec): - n_threads(_n_threads), tschd_vec(_tschd_vec), pool(_n_threads) + scheduler(const int32_t _n_threads, std::vector _data_vec): + n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) { - for (int32_t idx = 0; idx < tschd_vec.size(); idx++) { + for (int32_t idx = 0; idx < data_vec.size(); idx++) { /* TODO: set these: largest_tensor_size = ...; @@ -113,8 +115,4 @@ struct scheduler { void stop() { // TODO: graceful shutdown + deallocation of buffers } - - void submit_compute(tensor_sched_data & tschd) { - // TODO: - } }; From d7964472bccf9d19908c758471a6b178728f5cf1 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 5 Mar 2026 16:42:38 -0600 Subject: [PATCH 06/23] set buffer sizes --- src/llama-quant-scheduler.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index ea228dd4c81..6d8fcda797c 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -63,18 +63,18 @@ struct scheduler { // per-tensor metadata for all tensors in the model std::vector data_vec; - size_t largest_tensor_size_src = 0; // size of largest tensor to be quantized (as src type) - size_t largest_tensor_size_f32 = 0; // size of largest tensor to be quantized (as f32) - size_t largest_tensor_size_dst = 0; // size of largest tensor to be quantized (as dst type) + size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as f32) + size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) // // scheduling pipeline buffers (one of each at most) // - // size: largest_tensor_size_src + // size: max_tensor_size_src std::vector buf_read; // don't need this if using mmap? 
- // size: largest_tensor_size_f32 + // size: max_tensor_size_f32 std::vector buf_dequant; // dequantization buffer // size: largest_tensor_size_dst @@ -89,13 +89,13 @@ struct scheduler { scheduler(const int32_t _n_threads, std::vector _data_vec): n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) { - for (int32_t idx = 0; idx < data_vec.size(); idx++) { - /* - TODO: set these: - largest_tensor_size = ...; - largest_tensor_size_dequant = ...; - largest_tensor_size_quant = ...; - */ + GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); + for (int32_t idx = 0; idx < data_vec.size(); ++idx) { + const auto & data = data_vec[idx]; + const int64_t nrows = data.ne1 * data.ne2 * data.ne3; + max_src_sz = std::max(max_src_sz, nrows * ggml_row_size(data.src_type, data.ne0)); + max_f32_sz = std::max(max_f32_sz, nrows * ggml_row_size(GGML_TYPE_F32, data.ne0)); + max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } // TODO: allocate pipeline buffers From e382e661cddff5907344248662f7c37af3f5a60f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 6 Mar 2026 09:25:27 -0600 Subject: [PATCH 07/23] WIP --- src/llama-quant-scheduler.cpp | 41 +++++++++++++++-------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 6d8fcda797c..0d31de9dda2 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -29,7 +29,7 @@ #include #include -// pool of compute worker threads +// pool of worker threads used for dequantization and quantization struct compute_pool { const int32_t n_threads; std::vector threads; @@ -43,18 +43,18 @@ struct compute_pool { }; bool distribute(tensor_sched_data & data) { - // TODO: distribute + // TODO: distribute }; }; // // quantization work scheduler // -// goal: overlap I/O and computation as often as possible to speed-up the quantization process. +// goal: overlap I/O and computation as much as possible to speed up the quantization process. // // the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` -// - `n_threads` threads for the `compute_pool` (tensor math is divided among compute workers) +// - `n_threads` threads for the `compute_pool` // - 1 thread for the `write_worker` // struct scheduler { @@ -64,28 +64,23 @@ struct scheduler { std::vector data_vec; size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) - size_t max_f32_sz = 0; // size of largest tensor to be quantized (as f32) + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) // - // scheduling pipeline buffers (one of each at most) + // scheduler pipeline buffers (one of each at most) // - // size: max_tensor_size_src - std::vector buf_read; // don't need this if using mmap? - - // size: max_tensor_size_f32 - std::vector buf_dequant; // dequantization buffer - - // size: largest_tensor_size_dst - std::vector buf_quant; // quantization buffer (do we really need this?) - - // size = largest tensor (as dst type) - std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) + // size: max_src_sz + std::vector buf_read; // don't need this if using mmap? 
+ // size: max_f32_sz + std::vector buf_compute; // dequant/quant buffer + // size = max_dst_sz + std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; - // initialize + // init scheduler(const int32_t _n_threads, std::vector _data_vec): n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) { @@ -101,18 +96,18 @@ struct scheduler { // TODO: allocate pipeline buffers }; - ~scheduler() { - stop(); - } - void start() { // TODO: start `read_worker` thread // TODO: THIS thread should manage the compute pool // TODO: start `write_worker` thread - // return void when done, throw std::runtime_error if something fails + // throw std::runtime_error if something fails } void stop() { // TODO: graceful shutdown + deallocation of buffers } + + ~scheduler() { + stop(); + } }; From a4d4aab3aa9a1a696e19fcc4461bf36b4e3ac118 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 6 Mar 2026 16:31:03 -0600 Subject: [PATCH 08/23] WIP --- src/llama-quant-scheduler.cpp | 34 +++++++++++++++++++++++++++++----- src/llama-quant.h | 15 ++++++++------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 0d31de9dda2..718738f2127 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -29,6 +29,14 @@ #include #include +static int get_split_dimension(const tensor_sched_data & tsd, const int32_t n_threads) { + if (tsd.ne0 % n_threads) return 0; + if (tsd.ne1 % n_threads) return 1; + if (tsd.ne2 % n_threads) return 2; + if (tsd.ne3 % n_threads) return 3; + return -1; +} + // pool of worker threads used for dequantization and quantization struct compute_pool { const int32_t n_threads; @@ -63,9 +71,9 @@ struct scheduler { // per-tensor metadata for all tensors in the model std::vector data_vec; - size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) - size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) - size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) + size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) in bytes + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) in bytes + size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) in bytes // // scheduler pipeline buffers (one of each at most) @@ -93,7 +101,17 @@ struct scheduler { max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } - // TODO: allocate pipeline buffers + LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); + buf_read.resize(max_src_sz); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); + + LLAMA_LOG_DEBUG("%s: allocating compute buffer ... ", __func__); + buf_compute.resize(max_f32_sz); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); + + LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); + buf_write.resize(max_dst_sz); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); }; void start() { @@ -104,7 +122,13 @@ struct scheduler { } void stop() { - // TODO: graceful shutdown + deallocation of buffers + LLAMA_LOG_DEBUG("%s: deallocating buffers ... 
", __func__); + + buf_read.clear(); + buf_compute.clear(); + buf_write.clear(); + + LLAMA_LOG_DEBUG("done\n"); } ~scheduler() { diff --git a/src/llama-quant.h b/src/llama-quant.h index 8f58ec35c93..f75a812d44d 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -27,11 +27,12 @@ enum class tensor_category { // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. struct tensor_sched_data { - const void * const src_data; // pointer to raw source tensor data, read-only - const ggml_type src_type; - const ggml_type dst_type; - const int64_t ne0; // ncols - const int64_t ne1; // nrows - const int64_t ne2; // n_expert (or any 3rd tensor dimension) - const int64_t ne3; // any 4th tensor dimension (currently unused, always 1) + const ggml_type src_type; // source tensor type + const ggml_type dst_type; // destination tensor type + const int64_t ne0; // n_cols + const int64_t ne1; // n_rows + const int64_t ne2; // n_expert (or any 3rd tensor dimension) + const int64_t ne3; // any 4th tensor dimension (currently unused, always 1) + const void * const src_data; // pointer to raw source tensor data buffer, read-only + const void * const imatrix; // pointer to imatrix data, or nullptr, read-only }; From 956092ddd584bfc52f2da306998ee9d10d3c53ce Mon Sep 17 00:00:00 2001 From: ddh0 Date: Fri, 6 Mar 2026 22:57:22 -0600 Subject: [PATCH 09/23] WIP --- src/llama-quant-scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 718738f2127..e1e91f6e22b 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -80,7 +80,7 @@ struct scheduler { // // size: max_src_sz - std::vector buf_read; // don't need this if using mmap? + std::vector buf_read; // hold tensor data for reading // size: max_f32_sz std::vector buf_compute; // dequant/quant buffer // size = max_dst_sz From 1e1b692316bbd1501265ab1ed484a66b2427c2d4 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 7 Mar 2026 16:21:57 -0600 Subject: [PATCH 10/23] WIP --- src/llama-quant-scheduler.cpp | 141 ++++++++++++++++++++++++++-------- 1 file changed, 109 insertions(+), 32 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index e1e91f6e22b..7c8ae01db23 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -1,21 +1,70 @@ /** * - * Whenever possible, we aim to overlap computation and tensor data disk I/O in the quantization + * Whenever possible, we aim to overlap computation and tensor data I/O during the quantization * process. * - * This is the primary bottleneck in very many cases, and currently it's not handled very - * efficiently - computation never overlaps with I/O on `master` at the time of writing. Rather, - * the code basically does: + * This is the primary bottleneck in very many cases, and at the time of writing (2026-03) it's not + * handled very efficiently on `master` - computation never overlaps with I/O. Rather, the code + * essentially does: * - * load src tensor data -> dequantize and/or quantize -> write tensor data + * read src tensor data -> dequantize and/or quantize -> write dst tensor data * - * ...in a loop over all tensors. There is a great opportunity to improve it! I believe we may be - * able to acheive a speedup of ~4x in _some_ cases by overlapping the work to be done. 
There are
- * many users quantizing many models with many billions of parameters - we don't want to leave any
- * performance on the table.
+ * ...in a synchronous loop over all tensors. There is a great opportunity to improve it! I believe
+ * we may be able to achieve a speedup of ~3x in some cases by properly scheduling the work to be
+ * done. There are many users quantizing many models with many billions of parameters - we don't
+ * want to leave any performance on the table.
  *
 **/
+/**
+ * [NOTE: delete this comment block before PR]
+ *
+ * WORK-IN-PROGRESS -- DEV NOTES
+ * -----------------------------
+ *
+ * the scheduler will work like this:
+ *   0. all buffers start with "read_ready" = false, "write_ready" = true.
+ *   1. ggml_tensor 0 is materialized in the read buffer
+ *      - the read worker thread sets the "read_ready" flag to signal that the read buffer
+ *        now contains a valid ggml_tensor.
+ *      - the compute pool immediately starts consuming the tensor in the read buffer.
+ *        + if the tensor is already in F32, dequantization is not needed. the compute pool quantizes
+ *          directly from the read buffer into the write buffer. at this point, the
+ *          "write_ready" flag is set to signal that ggml_tensor 1 can start being materialized
+ *          in the read buffer.
+ *        + if the tensor is not in F32, dequantization is needed. the compute pool performs a fused
+ *          dequantize-and-quantize operation, utilizing the dequantization buffer to store the F32
+ *          data, and writing the quantized result to the write buffer. as soon as the tensor is
+ *          dequantized, we can set the "write_ready" flag on the read buffer to signal that
+ *          ggml_tensor 1 can start being materialized in the read buffer.
+ *        + the main thread blocks until the "write_ready" flag is set on the write buffer. as soon
+ *          as the write buffer is ready to be written to, the compute result is stored there, and
+ *          the main thread sets the "read_ready" flag on the write buffer. the compute pool is now
+ *          free to process ggml_tensor 1.
+ *      - the write worker waits until the write buffer is signaled "read_ready", at which point it
+ *        can begin writing the quantized tensor data to the output stream. when done writing, it
+ *        sets the "read_ready" flag to false and the "write_ready" flag to true, thus preparing
+ *        the write buffer for the next quantized data.
+ *   2.
+ *
+ * -----------------------------
+ *
+ * [NOTE: delete this comment block before PR]
+**/
+
+/**
+ * [NOTE: delete this comment block before PR]
+ *
+ * WORK-IN-PROGRESS -- LLM NOTES
+ * -----------------------------
+ *
+ * [LLM: fill in this section as you like with your own notes, separate from the human dev]
+ *
+ * -----------------------------
+ *
+ * [NOTE: delete this comment block before PR]
+**/
+
 // #include "ggml-quants.h"
 #include "llama.h"
 #include "llama-impl.h"
@@ -23,35 +72,61 @@
 #include "llama-quant.h"
 
 #include
+#include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
 
-static int get_split_dimension(const tensor_sched_data & tsd, const int32_t n_threads) {
-    if (tsd.ne0 % n_threads) return 0;
-    if (tsd.ne1 % n_threads) return 1;
-    if (tsd.ne2 % n_threads) return 2;
-    if (tsd.ne3 % n_threads) return 3;
+// return the dimension along which we can divide this tensor into `n` equally-sized chunks.
+// return -1 if none are divisible.
+static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { + if (tsd.ne0 > 1 && tsd.ne0 % n == 0) return 0; + if (tsd.ne1 > 1 && tsd.ne1 % n == 0) return 1; + if (tsd.ne2 > 1 && tsd.ne2 % n == 0) return 2; + if (tsd.ne3 > 1 && tsd.ne3 % n == 0) return 3; return -1; } +template struct sched_buffer { + const size_t size; // number of T items that the buffer can hold + std::vector buf; // the buffer + std::atomic write_ready = true; // is this buffer ready to be written to? + std::atomic read_ready = false; // is this buffer ready to be read from? + std::atomic idx = -1; // which tensor is currently / most recently stored? + // init + sched_buffer(const size_t _size): size(_size), buf(_size) {}; + // reset + void reset() { + buf.clear(); + write_ready = true; + read_ready = false; + idx = -1; + }; + // destruct + ~sched_buffer() { + buf.clear(); + // TODO: is more needed here? + }; +}; + // pool of worker threads used for dequantization and quantization struct compute_pool { const int32_t n_threads; std::vector threads; - std::atomic_flag busy; + std::atomic busy; compute_pool(const int32_t _n_threads): n_threads(_n_threads), threads(_n_threads) - { - // TODO: prepare the threads? but don't start them. - // TODO: init `busy` atomic flag? - }; + {}; - bool distribute(tensor_sched_data & data) { - // TODO: distribute + // distribute the work for this tensor among the compute threads. + // return an exception, if one occured during computation. + std::optional distribute(tensor_sched_data & data) { + // TODO }; }; @@ -80,17 +155,19 @@ struct scheduler { // // size: max_src_sz - std::vector buf_read; // hold tensor data for reading + sched_buffer buf_read; // hold source tensor data for reading // size: max_f32_sz - std::vector buf_compute; // dequant/quant buffer + sched_buffer buf_dequant; // hold dequantized tensor data // size = max_dst_sz - std::vector buf_write; // hold tensor data for writing (NOTE: tensors must be in order in the output file) + sched_buffer buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; // init scheduler(const int32_t _n_threads, std::vector _data_vec): - n_threads(_n_threads), data_vec(_data_vec), pool(_n_threads) + n_threads(_n_threads), + data_vec(_data_vec), + pool(_n_threads) { GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); for (int32_t idx = 0; idx < data_vec.size(); ++idx) { @@ -101,16 +178,16 @@ struct scheduler { max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } - LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read.resize(max_src_sz); + LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); + buf_read(max_src_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating compute buffer ... ", __func__); - buf_compute.resize(max_f32_sz); + LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); + buf_dequant(max_f32_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write.resize(max_dst_sz); + LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); + buf_write(max_dst_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); }; @@ -125,7 +202,7 @@ struct scheduler { LLAMA_LOG_DEBUG("%s: deallocating buffers ... 
", __func__); buf_read.clear(); - buf_compute.clear(); + buf_dequant.clear(); buf_write.clear(); LLAMA_LOG_DEBUG("done\n"); From 6f78f2b5db81a823946abad0b80162351b46c213 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 7 Mar 2026 22:52:21 -0600 Subject: [PATCH 11/23] WIP --- src/llama-quant-scheduler.cpp | 56 +++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 7c8ae01db23..c4867dd1fe2 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -91,40 +91,47 @@ static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { return -1; } -template struct sched_buffer { - const size_t size; // number of T items that the buffer can hold - std::vector buf; // the buffer - std::atomic write_ready = true; // is this buffer ready to be written to? - std::atomic read_ready = false; // is this buffer ready to be read from? - std::atomic idx = -1; // which tensor is currently / most recently stored? - // init - sched_buffer(const size_t _size): size(_size), buf(_size) {}; - // reset +template +struct sched_buffer { + size_t size; + std::vector buf; + std::atomic write_ready; + std::atomic read_ready; + std::atomic idx; + + sched_buffer() : size(0), buf(), write_ready(true), read_ready(false), idx(-1) {} + + void init(const size_t _size) { + size = _size; + buf = std::vector(_size); + write_ready = true; + read_ready = false; + idx = -1; + } + void reset() { buf.clear(); write_ready = true; read_ready = false; idx = -1; }; - // destruct - ~sched_buffer() { - buf.clear(); - // TODO: is more needed here? - }; + + ~sched_buffer() = default; }; -// pool of worker threads used for dequantization and quantization +// pool of worker threads used for dequantization + quantization struct compute_pool { const int32_t n_threads; std::vector threads; std::atomic busy; + std::optional opt_exc; compute_pool(const int32_t _n_threads): n_threads(_n_threads), threads(_n_threads) {}; - // distribute the work for this tensor among the compute threads. - // return an exception, if one occured during computation. + // distribute the computation to all worker threads. + // return an exception, if one occured during computation, nullopt otherwise. std::optional distribute(tensor_sched_data & data) { // TODO }; @@ -133,7 +140,8 @@ struct compute_pool { // // quantization work scheduler // -// goal: overlap I/O and computation as much as possible to speed up the quantization process. +// goal: overlap I/O and computation as much as possible to speed up the quantization process, +// while still being mindful of total memory usage. // // the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` @@ -179,16 +187,17 @@ struct scheduler { } LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read(max_src_sz); + buf_read.init(max_src_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); - buf_dequant(max_f32_sz); + buf_dequant.init(max_f32_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write(max_dst_sz); + buf_write.init(max_dst_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); + }; void start() { @@ -200,11 +209,6 @@ struct scheduler { void stop() { LLAMA_LOG_DEBUG("%s: deallocating buffers ... 
", __func__); - - buf_read.clear(); - buf_dequant.clear(); - buf_write.clear(); - LLAMA_LOG_DEBUG("done\n"); } From 1896fede20e6f527b0aba499ee5bbbf431f879e7 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 01:08:34 -0600 Subject: [PATCH 12/23] WIP --- src/llama-quant-scheduler.cpp | 146 ++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 59 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index c4867dd1fe2..86e50bd225a 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -60,6 +60,30 @@ * * [LLM: fill in this section as you like with your own notes, separate from the human dev] * + * ARCHITECTURE REVIEW & RECOMMENDATIONS + * ------------------------------------- + * 1. Synchronization Primitives: + * - Replace atomic bool polling with std::condition_variable to prevent busy-waiting. + * - Ensure memory_order_acquire/release is used if sticking with atomics for flags. + * + * 2. Buffering Strategy: + * - Current single-buffer design couples I/O and Compute latency. + * - Recommendation: Implement double-buffering (ping-pong) for 'buf_read' to allow + * loading Tensor N+1 while Computing Tensor N. + * + * 3. Exception Handling: + * - Change compute_pool::opt_exc from std::optional to + * std::optional to avoid object slicing. + * - Add a global 'stop_flag' to the scheduler to terminate all workers if one fails. + * + * 4. Compatibility: + * - Remove include. llama.cpp targets C++17; std::float_t is C++23. + * - Remove std::optional wrappers on buffers; they are always initialized. + * + * 5. Thread Pool: + * - compute_pool constructor must launch worker threads with a wait-loop, + * not just resize the vector. + * * ----------------------------- * * [NOTE: delete this comment block before PR] @@ -72,18 +96,17 @@ #include "llama-quant.h" #include -#include #include +#include #include #include -#include #include #include #include -// return the dimension along which we can divide this tensor into `n` equally-sized chunks. -// return -1 if none are divisible. -static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { +// determine the dimension along which we can divide this tensor into `n` equally-sized chunks. +// return 0, 1, 2, or 3. if none are divisible, return -1. +static int get_split_dim(const tensor_sched_data & tsd, const int64_t n) { if (tsd.ne0 > 1 && tsd.ne0 % n == 0) return 0; if (tsd.ne1 > 1 && tsd.ne1 % n == 0) return 1; if (tsd.ne2 > 1 && tsd.ne2 % n == 0) return 2; @@ -93,46 +116,56 @@ static int get_split_dimension(const tensor_sched_data & tsd, const int64_t n) { template struct sched_buffer { - size_t size; - std::vector buf; - std::atomic write_ready; - std::atomic read_ready; - std::atomic idx; - - sched_buffer() : size(0), buf(), write_ready(true), read_ready(false), idx(-1) {} - - void init(const size_t _size) { - size = _size; - buf = std::vector(_size); - write_ready = true; - read_ready = false; - idx = -1; + static_assert(std::is_same_v || std::is_same_v, + "sched_buffer only supports uint8_t and float"); + + std::vector buf; + std::mutex mtx; + std::atomic idx; // which tensor is currently / most recently stored? 
(-1 if none) + std::condition_variable cv; + std::atomic has_data; + + sched_buffer(const size_t _size): buf(_size), has_data(false), idx(-1) {} + + // producer calls this when data is written + void notify_ready() { + { + std::lock_guard lock(mtx); + has_data = true; + } + cv.notify_one(); } - void reset() { - buf.clear(); - write_ready = true; - read_ready = false; - idx = -1; - }; + // consumer calls this to wait for data + void wait_ready() { + std::unique_lock lock(mtx); + cv.wait(lock, [this]{ return has_data; }); + } - ~sched_buffer() = default; + // consumer calls this when done processing to release buffer + void release() { + { + std::lock_guard lock(mtx); + has_data = false; + } + cv.notify_one(); + } }; // pool of worker threads used for dequantization + quantization struct compute_pool { - const int32_t n_threads; + const int32_t n_threads; std::vector threads; - std::atomic busy; - std::optional opt_exc; + std::atomic busy; compute_pool(const int32_t _n_threads): - n_threads(_n_threads), threads(_n_threads) - {}; + n_threads(_n_threads), threads(_n_threads), busy(false) + { + // TODO: do we need to init the threads, or can this be left empty? + }; // distribute the computation to all worker threads. - // return an exception, if one occured during computation, nullopt otherwise. - std::optional distribute(tensor_sched_data & data) { + void distribute(tensor_sched_data & data) const { // TODO }; }; @@ -141,7 +174,7 @@ struct compute_pool { // quantization work scheduler // // goal: overlap I/O and computation as much as possible to speed up the quantization process, -// while still being mindful of total memory usage. +// while being mindful of total memory usage. // // the scheduler manages (`n_threads` + 2) threads: // - 1 thread for the `read_worker` @@ -151,35 +184,37 @@ struct compute_pool { struct scheduler { const int32_t n_threads; - // per-tensor metadata for all tensors in the model - std::vector data_vec; + // per-tensor data needed by the scheduler for all model tensors + std::vector tsd_vec; - size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) in bytes - size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) in bytes - size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) in bytes + size_t max_src_sz; // size of largest tensor to be quantized (as src type) in bytes + size_t max_f32_sz; // size of largest tensor to be quantized (as float32) in bytes + size_t max_dst_sz; // size of largest tensor to be quantized (as dst type) in bytes // // scheduler pipeline buffers (one of each at most) // // size: max_src_sz - sched_buffer buf_read; // hold source tensor data for reading + std::optional> buf_read; // hold source tensor data for reading // size: max_f32_sz - sched_buffer buf_dequant; // hold dequantized tensor data + std::optional> buf_dequant; // hold dequantized tensor data // size = max_dst_sz - sched_buffer buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) + std::optional> buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) compute_pool pool; // init - scheduler(const int32_t _n_threads, std::vector _data_vec): + scheduler(const int32_t _n_threads, std::vector _tsd_vec): n_threads(_n_threads), - data_vec(_data_vec), + tsd_vec(_tsd_vec), + max_src_sz(0), max_f32_sz(0), max_dst_sz(0), + buf_read(std::nullopt), buf_dequant(std::nullopt), buf_write(std::nullopt), pool(_n_threads) { 
GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); - for (int32_t idx = 0; idx < data_vec.size(); ++idx) { - const auto & data = data_vec[idx]; + for (int32_t idx = 0; idx < tsd_vec.size(); ++idx) { + const auto & data = tsd_vec[idx]; const int64_t nrows = data.ne1 * data.ne2 * data.ne3; max_src_sz = std::max(max_src_sz, nrows * ggml_row_size(data.src_type, data.ne0)); max_f32_sz = std::max(max_f32_sz, nrows * ggml_row_size(GGML_TYPE_F32, data.ne0)); @@ -187,32 +222,25 @@ struct scheduler { } LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read.init(max_src_sz); + buf_read.emplace(max_src_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); - buf_dequant.init(max_f32_sz); + buf_dequant.emplace(max_f32_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write.init(max_dst_sz); + buf_write.emplace(max_dst_sz); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); - }; + } - void start() { + void run() { // TODO: start `read_worker` thread // TODO: THIS thread should manage the compute pool // TODO: start `write_worker` thread // throw std::runtime_error if something fails } - void stop() { - LLAMA_LOG_DEBUG("%s: deallocating buffers ... ", __func__); - LLAMA_LOG_DEBUG("done\n"); - } - - ~scheduler() { - stop(); - } + ~scheduler() = default; }; From 3c046f941075d04c8ea2a3aa8ae50b3b2b60e567 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 01:23:59 -0600 Subject: [PATCH 13/23] WIP --- src/llama-quant-scheduler.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 86e50bd225a..11eafc68dcd 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -89,7 +89,6 @@ * [NOTE: delete this comment block before PR] **/ -// #include "ggml-quants.h" #include "llama.h" #include "llama-impl.h" #include "llama-model.h" From 8195ad644b570914aa6e9fc80d1e0a062be2bb24 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 01:58:44 -0600 Subject: [PATCH 14/23] re-org includes --- src/llama-quant-scheduler.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 11eafc68dcd..5b7db2bc160 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -94,14 +94,15 @@ #include "llama-model.h" #include "llama-quant.h" -#include -#include -#include -#include +#include #include #include #include -#include +#include +#include +#include +#include +#include // determine the dimension along which we can divide this tensor into `n` equally-sized chunks. // return 0, 1, 2, or 3. if none are divisible, return -1. 
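The sched_buffer developed in the commits above is converging on a classic single-slot producer/consumer handoff: one mutex, one condition variable, one has-data flag. The following is a minimal, self-contained sketch of that pattern for reference only; the names slot, produce and consume are illustrative and are not part of this patch series, and the real scheduler would let the read of tensor N+1 overlap with the compute and write of tensor N instead of strictly alternating the way this toy example does.

    #include <algorithm>
    #include <condition_variable>
    #include <cstdint>
    #include <cstdio>
    #include <mutex>
    #include <thread>
    #include <vector>

    // single-slot handoff: the producer fills the buffer and signals it, the consumer
    // waits for it, drains it, and releases the slot so the producer can reuse it
    struct slot {
        std::vector<uint8_t>    buf;
        bool                    has_data = false;
        std::mutex              mtx;
        std::condition_variable cv;

        explicit slot(size_t size) : buf(size) {}

        // stands in for the read worker materializing the next tensor in the buffer
        void produce(uint8_t fill) {
            std::unique_lock<std::mutex> lock(mtx);
            cv.wait(lock, [this] { return !has_data; }); // wait until the slot is free
            std::fill(buf.begin(), buf.end(), fill);
            has_data = true;
            lock.unlock();
            cv.notify_one();
        }

        // stands in for the compute pool consuming the tensor currently in the buffer
        void consume(uint8_t expected) {
            std::unique_lock<std::mutex> lock(mtx);
            cv.wait(lock, [this] { return has_data; }); // wait until the slot is full
            if (buf[0] != expected) {
                std::fprintf(stderr, "unexpected payload\n");
            }
            has_data = false;
            lock.unlock();
            cv.notify_one();
        }
    };

    int main() {
        slot s(1024);
        const int n_tensors = 8;

        std::thread reader([&] {
            for (int i = 0; i < n_tensors; ++i) {
                s.produce(static_cast<uint8_t>(i)); // "read" tensor i
            }
        });

        for (int i = 0; i < n_tensors; ++i) {
            s.consume(static_cast<uint8_t>(i));     // "compute + write" tensor i
        }

        reader.join();
        std::printf("processed %d tensors through a single-slot pipeline\n", n_tensors);
        return 0;
    }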
From c2e55ccefce913c261309a0d50abdafd8e41586d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 16:05:41 -0500 Subject: [PATCH 15/23] reflect header changes from # 19770 --- src/llama-quant.cpp | 24 ++++++++++++++++++++++++ src/llama-quant.h | 23 ----------------------- tools/quantize/quantize.cpp | 7 +++++++ 3 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 23b7585adae..caf5d10a9d6 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -14,6 +14,30 @@ #include #include +// result of parsing --tensor-type option +// (changes to this struct must be reflected in tools/quantize/quantize.cpp) +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + +// tensor categorization - used to avoid repeated string matching in quantization logic. +// this is different from LLM_TN - we want broad categories, not specific tensor names per arch. +enum class tensor_category { + TOKEN_EMBD, + ATTENTION_Q, + ATTENTION_V, + ATTENTION_K, + ATTENTION_QKV, + ATTENTION_KV_B, + ATTENTION_OUTPUT, + FFN_UP, + FFN_GATE, + FFN_DOWN, + OUTPUT, + OTHER +}; + static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { diff --git a/src/llama-quant.h b/src/llama-quant.h index f75a812d44d..0604ea63bdb 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,29 +1,6 @@ #pragma once #include -// result of parsing --tensor-type option -struct tensor_type_option { - std::string name; - ggml_type type = GGML_TYPE_COUNT; -}; - -// tensor categorization - used to avoid repeated string matching in quantization logic. -// this is different from LLM_TN - we want broad categories, not specific tensor names per arch. -enum class tensor_category { - TOKEN_EMBD, - ATTENTION_Q, - ATTENTION_V, - ATTENTION_K, - ATTENTION_QKV, - ATTENTION_KV_B, - ATTENTION_OUTPUT, - FFN_UP, - FFN_GATE, - FFN_DOWN, - OUTPUT, - OTHER -}; - // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. struct tensor_sched_data { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 7aa84859bc1..04276742d5c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -19,6 +19,13 @@ #include #include +// result of parsing --tensor-type option +// (changes to this struct must be reflected in src/llama-quant.cpp) +struct tensor_quantization { + std::string name; + ggml_type quant = GGML_TYPE_COUNT; +}; + struct quant_option { std::string name; llama_ftype ftype; From 8e3c6890e807c389b7a1da6bc5e3ef8ee031e2c5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sun, 8 Mar 2026 16:13:55 -0500 Subject: [PATCH 16/23] WIP --- src/llama-quant.cpp | 6 +++--- tools/quantize/quantize.cpp | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index caf5d10a9d6..74ff7f75caa 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -16,9 +16,9 @@ // result of parsing --tensor-type option // (changes to this struct must be reflected in tools/quantize/quantize.cpp) -struct tensor_quantization { +struct tensor_type_option { std::string name; - ggml_type quant = GGML_TYPE_COUNT; + ggml_type type = GGML_TYPE_COUNT; }; // tensor categorization - used to avoid repeated string matching in quantization logic. 
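Each tensor_type_option produced by --tensor-type parsing is later turned into a compiled regex that is matched against tensor names to pick a per-tensor override type. A minimal sketch of that name-to-override lookup is shown below; it uses a stand-in fake_type enum instead of ggml_type so it builds on its own, and the pattern strings and tensor names are examples only, not the exact syntax accepted by parse_tensor_type.

    #include <cstdio>
    #include <regex>
    #include <string>
    #include <utility>
    #include <vector>

    // stand-in for ggml_type so the sketch compiles on its own
    enum class fake_type { f16, q8_0, q6_k };

    int main() {
        // patterns as they might come out of options like --tensor-type "attn_v=q6_k"
        std::vector<std::pair<std::regex, fake_type>> patterns;
        patterns.emplace_back(std::regex("attn_v\\.weight"),   fake_type::q6_k);
        patterns.emplace_back(std::regex("ffn_down\\.weight"), fake_type::q8_0);

        const std::string names[] = {
            "blk.0.attn_v.weight",
            "blk.0.ffn_up.weight",
            "blk.1.ffn_down.weight",
        };

        for (const auto & name : names) {
            fake_type type = fake_type::f16; // whatever the ftype rules would otherwise pick
            for (const auto & [re, override_type] : patterns) {
                if (std::regex_search(name, re)) {
                    type = override_type;
                    break;
                }
            }
            std::printf("%-24s -> type %d\n", name.c_str(), static_cast<int>(type));
        }
        return 0;
    }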
@@ -189,7 +189,7 @@ struct quantize_state_impl { : model(model) , params(params) { - // compile regex patterns once - they are expensive + // compile regex patterns just once - they could be expensive if (params->tensor_types) { const auto & tensor_types = *static_cast *>(params->tensor_types); for (const auto & [name, type] : tensor_types) { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 04276742d5c..b84b2b6e554 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -1,7 +1,6 @@ #include "common.h" #include "llama.h" #include "gguf.h" -#include "../src/llama-quant.h" #include #include @@ -21,9 +20,9 @@ // result of parsing --tensor-type option // (changes to this struct must be reflected in src/llama-quant.cpp) -struct tensor_quantization { +struct tensor_type_option { std::string name; - ggml_type quant = GGML_TYPE_COUNT; + ggml_type type = GGML_TYPE_COUNT; }; struct quant_option { @@ -503,7 +502,7 @@ int main(int argc, char ** argv) { std::string imatrix_file; std::vector included_weights, excluded_weights; std::vector kv_overrides; - std::vector tensor_types; + std::vector tensor_type_options; std::vector prune_layers; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { @@ -528,11 +527,11 @@ int main(int argc, char ** argv) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--tensor-type") == 0) { - if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) { + if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_type_options)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) { - if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) { + if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_type_options)) { usage(argv[0]); } } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) { @@ -626,8 +625,8 @@ int main(int argc, char ** argv) { kv_overrides.back().key[0] = 0; params.kv_overrides = &kv_overrides; } - if (!tensor_types.empty()) { - params.tensor_types = &tensor_types; + if (!tensor_type_options.empty()) { + params.tensor_types = &tensor_type_options; } if (!prune_layers.empty()) { params.prune_layers = &prune_layers; From fb728ac63bb6bd99999c7a1c7f2d0aca52de541d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 15:39:44 -0500 Subject: [PATCH 17/23] WIP --- src/llama-quant-scheduler.cpp | 109 ++++++++++++++++------------------ 1 file changed, 51 insertions(+), 58 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 5b7db2bc160..b69c8c7b87e 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -60,30 +60,6 @@ * * [LLM: fill in this section as you like with your own notes, separate from the human dev] * - * ARCHITECTURE REVIEW & RECOMMENDATIONS - * ------------------------------------- - * 1. Synchronization Primitives: - * - Replace atomic bool polling with std::condition_variable to prevent busy-waiting. - * - Ensure memory_order_acquire/release is used if sticking with atomics for flags. - * - * 2. Buffering Strategy: - * - Current single-buffer design couples I/O and Compute latency. - * - Recommendation: Implement double-buffering (ping-pong) for 'buf_read' to allow - * loading Tensor N+1 while Computing Tensor N. - * - * 3. Exception Handling: - * - Change compute_pool::opt_exc from std::optional to - * std::optional to avoid object slicing. 
- * - Add a global 'stop_flag' to the scheduler to terminate all workers if one fails. - * - * 4. Compatibility: - * - Remove include. llama.cpp targets C++17; std::float_t is C++23. - * - Remove std::optional wrappers on buffers; they are always initialized. - * - * 5. Thread Pool: - * - compute_pool constructor must launch worker threads with a wait-loop, - * not just resize the vector. - * * ----------------------------- * * [NOTE: delete this comment block before PR] @@ -91,7 +67,6 @@ #include "llama.h" #include "llama-impl.h" -#include "llama-model.h" #include "llama-quant.h" #include @@ -99,51 +74,60 @@ #include #include #include -#include -#include #include #include // determine the dimension along which we can divide this tensor into `n` equally-sized chunks. // return 0, 1, 2, or 3. if none are divisible, return -1. static int get_split_dim(const tensor_sched_data & tsd, const int64_t n) { - if (tsd.ne0 > 1 && tsd.ne0 % n == 0) return 0; - if (tsd.ne1 > 1 && tsd.ne1 % n == 0) return 1; - if (tsd.ne2 > 1 && tsd.ne2 % n == 0) return 2; - if (tsd.ne3 > 1 && tsd.ne3 % n == 0) return 3; + if (tsd.ne0 > n && tsd.ne0 % n == 0) return 0; + if (tsd.ne1 > n && tsd.ne1 % n == 0) return 1; + if (tsd.ne2 > n && tsd.ne2 % n == 0) return 2; + if (tsd.ne3 > n && tsd.ne3 % n == 0) return 3; return -1; } -template -struct sched_buffer { +template struct sched_buffer { static_assert(std::is_same_v || std::is_same_v, "sched_buffer only supports uint8_t and float"); std::vector buf; std::mutex mtx; - std::atomic idx; // which tensor is currently / most recently stored? (-1 if none) - std::condition_variable cv; + std::atomic idx; // which tensor is currently or most recently stored? (-1 at init, then 0 for 1st tensor, 1 for 2nd tensor...) std::atomic has_data; + std::condition_variable cv; - sched_buffer(const size_t _size): buf(_size), has_data(false), idx(-1) {} + // init but don't allocate the buffer yet + sched_buffer(): + has_data(false), idx(-1) + {} - // producer calls this when data is written - void notify_ready() { + // allocate the buffer and return the allocated size in bytes + size_t allocate(const size_t _size) { + buf.resize(_size); + return sizeof(T) * _size; + } + + // signal to workers that this buffer now has data for tensor at index `_idx`. + // this updates the buffer's `idx` to match. all indices must be sequential. + void signal_has_data(const int64_t _idx) { { std::lock_guard lock(mtx); + GGML_ASSERT(_idx == idx + 1 && "buffer tensor indices must be sequential"); has_data = true; + idx = _idx; } cv.notify_one(); } - // consumer calls this to wait for data - void wait_ready() { + // workers call this function to wait for data in this buffer. + void wait_for_data() { std::unique_lock lock(mtx); cv.wait(lock, [this]{ return has_data; }); } - // consumer calls this when done processing to release buffer - void release() { + // signal to workers that this buffer should no longer be read from. 
+ void signal_no_data() { { std::lock_guard lock(mtx); has_data = false; @@ -187,20 +171,24 @@ struct scheduler { // per-tensor data needed by the scheduler for all model tensors std::vector tsd_vec; - size_t max_src_sz; // size of largest tensor to be quantized (as src type) in bytes - size_t max_f32_sz; // size of largest tensor to be quantized (as float32) in bytes - size_t max_dst_sz; // size of largest tensor to be quantized (as dst type) in bytes + size_t max_src_sz = 0; // size of largest tensor to be quantized (as src type) in bytes + size_t max_f32_sz = 0; // size of largest tensor to be quantized (as float32) in bytes + size_t max_dst_sz = 0; // size of largest tensor to be quantized (as dst type) in bytes // // scheduler pipeline buffers (one of each at most) // // size: max_src_sz - std::optional> buf_read; // hold source tensor data for reading + sched_buffer buf_read; // tensor data is read into here as fast as possible (read worker keeps it full). + // size: max_src_sz + sched_buffer buf_compute_src; // compute workers read src tensor data from here // size: max_f32_sz - std::optional> buf_dequant; // hold dequantized tensor data + sched_buffer buf_compute_f32; // intermediate f32 tensor data (if necessary) + // size = max_dst_sz + sched_buffer buf_compute_dst; // compute workers write dst tensor data into here // size = max_dst_sz - std::optional> buf_write; // hold quantized tensor data for writing (NOTE: tensors must be in order in the output file) + sched_buffer buf_write; // tensor data is written to the output stream IN ORDER by the write worker. compute_pool pool; @@ -208,36 +196,41 @@ struct scheduler { scheduler(const int32_t _n_threads, std::vector _tsd_vec): n_threads(_n_threads), tsd_vec(_tsd_vec), - max_src_sz(0), max_f32_sz(0), max_dst_sz(0), - buf_read(std::nullopt), buf_dequant(std::nullopt), buf_write(std::nullopt), pool(_n_threads) { GGML_ASSERT(GGML_MAX_DIMS == 4 && "GGML_MAX_DIMS is not 4 - update this function"); for (int32_t idx = 0; idx < tsd_vec.size(); ++idx) { const auto & data = tsd_vec[idx]; - const int64_t nrows = data.ne1 * data.ne2 * data.ne3; + const size_t nrows = data.ne1 * data.ne2 * data.ne3; max_src_sz = std::max(max_src_sz, nrows * ggml_row_size(data.src_type, data.ne0)); max_f32_sz = std::max(max_f32_sz, nrows * ggml_row_size(GGML_TYPE_F32, data.ne0)); max_dst_sz = std::max(max_dst_sz, nrows * ggml_row_size(data.dst_type, data.ne0)); } - LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); - buf_read.emplace(max_src_sz); + LLAMA_LOG_DEBUG("%s: allocating read buffer ... ", __func__); + GGML_ASSERT(max_src_sz == buf_read.allocate(max_src_sz)); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating dequantization buffer ... ", __func__); - buf_dequant.emplace(max_f32_sz); + LLAMA_LOG_DEBUG("%s: allocating compute src buffer ... ", __func__); + GGML_ASSERT(max_src_sz == buf_compute_src.allocate(max_src_sz)); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_src_sz/1024.0/1024.0); + + LLAMA_LOG_DEBUG("%s: allocating compute f32 buffer ... ", __func__); + GGML_ASSERT(max_f32_sz == buf_compute_f32.allocate(max_f32_sz / sizeof(float))); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_f32_sz/1024.0/1024.0); - LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); - buf_write.emplace(max_dst_sz); + LLAMA_LOG_DEBUG("%s: allocating compute dst buffer ... 
", __func__); + GGML_ASSERT(max_dst_sz == buf_compute_dst.allocate(max_dst_sz)); LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); + LLAMA_LOG_DEBUG("%s: allocating write buffer ... ", __func__); + GGML_ASSERT(max_dst_sz == buf_write.allocate(max_dst_sz)); + LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); } void run() { // TODO: start `read_worker` thread - // TODO: THIS thread should manage the compute pool + // TODO: start `compute_worker` thread (?) // TODO: start `write_worker` thread // throw std::runtime_error if something fails } From 30621906c80f2b0915d1477399a9678ba9855f94 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 17:01:29 -0500 Subject: [PATCH 18/23] WIP --- src/llama-quant-scheduler.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index b69c8c7b87e..4093f1d500f 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -180,18 +180,22 @@ struct scheduler { // // size: max_src_sz - sched_buffer buf_read; // tensor data is read into here as fast as possible (read worker keeps it full). + sched_buffer buf_read; // tensor data is read into here as fast as possible by `reader_th` // size: max_src_sz - sched_buffer buf_compute_src; // compute workers read src tensor data from here + sched_buffer buf_compute_src; // compute pool reads src tensor data from here // size: max_f32_sz sched_buffer buf_compute_f32; // intermediate f32 tensor data (if necessary) // size = max_dst_sz - sched_buffer buf_compute_dst; // compute workers write dst tensor data into here + sched_buffer buf_compute_dst; // compute pool writes dst tensor data into here // size = max_dst_sz - sched_buffer buf_write; // tensor data is written to the output stream IN ORDER by the write worker. + sched_buffer buf_write; // tensor data is written from here to the output stream (IN ORDER) by `writer_th` compute_pool pool; + std::thread reader_th; // constantly reading tensor data from the original model into buf_read. + std::thread compute_th; // manages compute_pool (exceptions, stopping, etc.) + std::thread writer_th; // constantly writing tensor data from buf_write to the output stream IN ORDER. + // init scheduler(const int32_t _n_threads, std::vector _tsd_vec): n_threads(_n_threads), @@ -229,9 +233,9 @@ struct scheduler { } void run() { - // TODO: start `read_worker` thread - // TODO: start `compute_worker` thread (?) 
- // TODO: start `write_worker` thread + // TODO: start `reader_th` thread + // TODO: start `compute_th` thread + // TODO: start `writer_th` thread // throw std::runtime_error if something fails } From 7d93875604eab61c919dac13e813a4fb0dcadbb6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 21:30:36 -0500 Subject: [PATCH 19/23] WIP --- src/CMakeLists.txt | 1 + src/llama-quant-scheduler.cpp | 60 ++++++++++++++++++++++++----------- 2 files changed, 43 insertions(+), 18 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 283823fa9c8..a98ed4e1cd0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ add_library(llama llama-model-saver.cpp llama-model.cpp llama-quant.cpp + llama-quant-scheduler.cpp llama-sampler.cpp llama-vocab.cpp unicode-data.cpp diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 4093f1d500f..5b44da5ea59 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -77,16 +77,6 @@ #include #include -// determine the dimension along which we can divide this tensor into `n` equally-sized chunks. -// return 0, 1, 2, or 3. if none are divisible, return -1. -static int get_split_dim(const tensor_sched_data & tsd, const int64_t n) { - if (tsd.ne0 > n && tsd.ne0 % n == 0) return 0; - if (tsd.ne1 > n && tsd.ne1 % n == 0) return 1; - if (tsd.ne2 > n && tsd.ne2 % n == 0) return 2; - if (tsd.ne3 > n && tsd.ne3 % n == 0) return 3; - return -1; -} - template struct sched_buffer { static_assert(std::is_same_v || std::is_same_v, "sched_buffer only supports uint8_t and float"); @@ -98,9 +88,7 @@ template struct sched_buffer { std::condition_variable cv; // init but don't allocate the buffer yet - sched_buffer(): - has_data(false), idx(-1) - {} + sched_buffer(): has_data(false), idx(-1) {} // allocate the buffer and return the allocated size in bytes size_t allocate(const size_t _size) { @@ -109,11 +97,11 @@ template struct sched_buffer { } // signal to workers that this buffer now has data for tensor at index `_idx`. - // this updates the buffer's `idx` to match. all indices must be sequential. + // this updates the buffer's `idx` to match. indices must be sequential. void signal_has_data(const int64_t _idx) { { std::lock_guard lock(mtx); - GGML_ASSERT(_idx == idx + 1 && "buffer tensor indices must be sequential"); + GGML_ASSERT(_idx == idx + 1 && "tensor buffer indices must be sequential"); has_data = true; idx = _idx; } @@ -136,6 +124,42 @@ template struct sched_buffer { } }; +struct read_worker { + std::thread thread; + const sched_buffer & buf; + + read_worker(const sched_buffer & _buf): buf(_buf) { + // TODO: init? + }; + + ~read_worker() { + // TODO: safe stoppage + destruction of thread + } +}; + +struct write_worker { + std::thread thread; + const sched_buffer & buf; + + write_worker(const sched_buffer & _buf): buf(_buf) { + // TODO: init? + }; + + ~write_worker() { + // TODO: safe stoppage + destruction of thread + } +}; + +// determine the dimension along which we can divide this tensor into `n` equally-sized chunks. +// return 0, 1, 2, or 3. if none are divisible, return -1. 
+static int get_split_dim(const std::vector & ne, const int64_t n) { + if (ne[0] > n && ne[0] % n == 0) return 0; + if (ne[1] > n && ne[1] % n == 0) return 1; + if (ne[2] > n && ne[2] % n == 0) return 2; + if (ne[3] > n && ne[3] % n == 0) return 3; + return -1; +} + // pool of worker threads used for dequantization + quantization struct compute_pool { const int32_t n_threads; @@ -192,9 +216,9 @@ struct scheduler { compute_pool pool; - std::thread reader_th; // constantly reading tensor data from the original model into buf_read. - std::thread compute_th; // manages compute_pool (exceptions, stopping, etc.) - std::thread writer_th; // constantly writing tensor data from buf_write to the output stream IN ORDER. + // std::thread reader_th; // constantly reading tensor data from the original model into buf_read. + // std::thread compute_th; // manages compute_pool (exceptions, stopping, etc.) + // std::thread writer_th; // constantly writing tensor data from buf_write to the output stream IN ORDER. // init scheduler(const int32_t _n_threads, std::vector _tsd_vec): From 8368609d0ead4011208630347809a25d830ee9c2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 22:00:22 -0500 Subject: [PATCH 20/23] WIP --- src/llama-quant.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-quant.h b/src/llama-quant.h index 0604ea63bdb..ea3bc67d914 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -1,5 +1,4 @@ #pragma once -#include // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. From f5c0c38ee2a1f72dd8959a41ee481a0d781d3f4b Mon Sep 17 00:00:00 2001 From: ddh0 Date: Mon, 9 Mar 2026 22:10:19 -0500 Subject: [PATCH 21/23] WIP --- src/llama-quant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index d82cdc5fb67..55943370bd8 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -180,7 +180,7 @@ struct quantize_state_impl { bool has_imatrix = false; // used to figure out if a model has tied embeddings (tok_embd shares weights with output) - bool has_tied_embeddings = false; // assume tied until we see output.weight + bool has_tied_embeddings = true; // assume tied until we see output.weight // tensor type override patterns (compiled once, used twice) std::vector> tensor_type_patterns; From 3db73fcb83d51197cdf5ec348703b8e9496ef728 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 10 Mar 2026 02:19:42 -0500 Subject: [PATCH 22/23] WIP --- src/llama-quant-scheduler.cpp | 9 ++------- src/llama-quant.h | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/llama-quant-scheduler.cpp b/src/llama-quant-scheduler.cpp index 5b44da5ea59..19c566149a6 100644 --- a/src/llama-quant-scheduler.cpp +++ b/src/llama-quant-scheduler.cpp @@ -256,12 +256,7 @@ struct scheduler { LLAMA_LOG_DEBUG("%8.2f MiB\n", max_dst_sz/1024.0/1024.0); } - void run() { - // TODO: start `reader_th` thread - // TODO: start `compute_th` thread - // TODO: start `writer_th` thread - // throw std::runtime_error if something fails - } - ~scheduler() = default; }; + +void scheduler::run() {}; \ No newline at end of file diff --git a/src/llama-quant.h b/src/llama-quant.h index ea3bc67d914..c340a7e3845 100644 --- a/src/llama-quant.h +++ b/src/llama-quant.h @@ -3,12 +3,19 @@ // per-tensor info needed by the quantization work scheduler. // constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise. 
struct tensor_sched_data {
-    const ggml_type src_type;    // source tensor type
-    const ggml_type dst_type;    // destination tensor type
-    const int64_t ne0;           // n_cols
-    const int64_t ne1;           // n_rows
-    const int64_t ne2;           // n_expert (or any 3rd tensor dimension)
-    const int64_t ne3;           // any 4th tensor dimension (currently unused, always 1)
-    const void * const src_data; // pointer to raw source tensor data buffer, read-only
-    const void * const imatrix;  // pointer to imatrix data, or nullptr, read-only
+    const std::vector<int64_t> ne;
+    const ggml_type src_type;
+    const ggml_type dst_type;
+    const void * src_data; // pointer to raw source tensor data buffer, read-only
+    const void * imatrix;  // pointer to imatrix data, or nullptr, read-only
+    tensor_sched_data(
+        const std::vector<int64_t> & _ne,
+        const ggml_type _src_type,
+        const ggml_type _dst_type,
+        const void * _src_data,
+        const void * _imatrix
+    ) :
+        ne(_ne), src_type(_src_type), dst_type(_dst_type),
+        src_data(_src_data), imatrix(_imatrix)
+    {}
 };

From 16bef330b9d0f1a53051263765702ce47f4302bb Mon Sep 17 00:00:00 2001
From: ddh0 
Date: Wed, 25 Mar 2026 02:59:31 -0500
Subject: [PATCH 23/23] begin to adopt command pattern

---
 src/llama-quant.h | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/llama-quant.h b/src/llama-quant.h
index c340a7e3845..d1c4cc9fdd6 100644
--- a/src/llama-quant.h
+++ b/src/llama-quant.h
@@ -1,21 +1,25 @@
 #pragma once

-// per-tensor info needed by the quantization work scheduler.
-// constructed in llama-quant.cpp, passed to llama-quant-scheduler.cpp, not used otherwise.
-struct tensor_sched_data {
-    const std::vector<int64_t> ne;
-    const ggml_type src_type;
-    const ggml_type dst_type;
-    const void * src_data; // pointer to raw source tensor data buffer, read-only
-    const void * imatrix;  // pointer to imatrix data, or nullptr, read-only
-    tensor_sched_data(
-        const std::vector<int64_t> & _ne,
-        const ggml_type _src_type,
-        const ggml_type _dst_type,
-        const void * _src_data,
-        const void * _imatrix
-    ) :
-        ne(_ne), src_type(_src_type), dst_type(_dst_type),
-        src_data(_src_data), imatrix(_imatrix)
-    {}
+enum sched_cmd_status {
+    CMD_STATUS_PENDING,
+    CMD_STATUS_IN_PROGRESS,
+    CMD_STATUS_COMPLETE,
+    CMD_STATUS_COUNT, // always last
+};
+
+// types of operations performed by the quantization work scheduler
+enum sched_cmd_type {
+    CMD_TYPE_READ,
+    CMD_TYPE_DEQUANTIZE,
+    CMD_TYPE_QUANTIZE,
+    CMD_TYPE_WRITE,
+    CMD_TYPE_COUNT // always last
+};
+
+// unit of work for the quantization work scheduler (command pattern)
+struct sched_cmd {
+    const ggml_tensor * tensor;
+    const enum sched_cmd_type sched_cmd_type;
+
+    std::atomic<sched_cmd_status> sched_cmd_status;
 };
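The final commit only introduces the command vocabulary (sched_cmd plus its type and status enums); how commands get queued and executed is still open. As a rough, non-authoritative sketch of where a command-pattern scheduler typically goes, the example below has worker threads drain a shared command queue. Everything in it beyond the enums (the deque, the worker lambda, the int tensor index standing in for const ggml_tensor *) is assumed for illustration and is not part of this patch series; a real version would also have to enforce per-tensor ordering (read before quantize before write) and keep tensors in file order when writing.

    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <thread>
    #include <vector>

    // illustrative mirrors of the enums above, with shortened names to keep the sketch compact
    enum cmd_type   { CMD_READ, CMD_DEQUANTIZE, CMD_QUANTIZE, CMD_WRITE };
    enum cmd_status { PENDING, IN_PROGRESS, COMPLETE };

    struct cmd {
        int        tensor_idx;          // stands in for `const ggml_tensor * tensor`
        cmd_type   type;
        cmd_status status = PENDING;
    };

    int main() {
        std::deque<cmd> queue;
        std::mutex      mtx;

        // enqueue a read -> quantize -> write sequence for a few fake tensors
        for (int i = 0; i < 4; ++i) {
            queue.push_back({i, CMD_READ});
            queue.push_back({i, CMD_QUANTIZE});
            queue.push_back({i, CMD_WRITE});
        }

        auto worker = [&] {
            while (true) {
                cmd c{};
                {
                    std::lock_guard<std::mutex> lock(mtx);
                    if (queue.empty()) {
                        return;
                    }
                    c = queue.front();
                    queue.pop_front();
                }
                c.status = IN_PROGRESS;
                // a real worker would read / dequantize / quantize / write here
                c.status = COMPLETE;
                std::printf("tensor %d: command %d complete\n", c.tensor_idx, c.type);
            }
        };

        std::vector<std::thread> workers;
        for (int i = 0; i < 2; ++i) {
            workers.emplace_back(worker);
        }
        for (auto & t : workers) {
            t.join();
        }
        return 0;
    }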