diff --git a/Q4_HIFI_ROADMAP.md b/Q4_K_HIFI_ROADMAP.md similarity index 83% rename from Q4_HIFI_ROADMAP.md rename to Q4_K_HIFI_ROADMAP.md index ee6d8c329dc..72bface8375 100644 --- a/Q4_HIFI_ROADMAP.md +++ b/Q4_K_HIFI_ROADMAP.md @@ -13,7 +13,7 @@ Geoff Munn​ | Finding | Strategic Implication | |--------|------------------------| | ✅ **Q3_HIFI excels on ≤2B models** | Outlier preservation + Q3_K base = optimal for small models | -| ❌ **Q4_HIFI fails on ≥4B models** | Sparse outliers can’t fix aggressive 4-bit base quantization | +| ❌ **Q4_K_HIFI fails on ≥4B models** | Sparse outliers can't fix aggressive 4-bit base quantization | | ✅ **Q4_K_M wins via Q6_K on key tensors** | Uniform higher precision > sparse outliers at scale | | ✅ **Early layers & embeddings matter most** | Precision should focus on `attn_v`, `ffn_gate`, `token_embd` | | ✅ **Domain-mixed imatrix is essential** | 60% Wikitext, 25% Code, 15% Math for balanced outlier selection | @@ -25,8 +25,8 @@ Geoff Munn​ | Format | Model Size | Strategy | Base Precision | Enhancement | |--------|------------|----------|----------------|-------------| | **Q3_HIFI** | **≤2B** | Outlier preservation | Q3_K | 8 FP16 outliers on early layers | -| **Q4_HIFI_M** | **3–10B** | Smart Q5_K allocation | Q4_K + Q5_K | Q5_K on sensitive tensors | -| **Q4_HIFI_L** | **>10B** | Q4_K_M + precision refinement | Q4_K + Q6_K | 6 FP16 outliers on Q6_K tensors | +| **Q4_K_HIFI_M** | **3–10B** | Smart Q5_K allocation | Q4_K + Q5_K | Q5_K on sensitive tensors | +| **Q4_K_HIFI_L** | **>10B** | Q4_K_M + precision refinement | Q4_K + Q6_K | 6 FP16 outliers on Q6_K tensors | --- @@ -53,7 +53,7 @@ static bool is_q3_hifi_tensor(const char* name, int layer_idx) { --- -## 🚀 **Phase 2: Q4_HIFI_M — Smart Q5_K Allocation (3–10B Models)** +## 🚀 **Phase 2: Q4_K_HIFI_M — Smart Q5_K Allocation (3–10B Models)** ### 🎯 **Objective**: Beat Q4_K_M by **replacing Q4_K with Q5_K on sensitive tensors**. 
@@ -81,7 +81,7 @@ static ggml_type get_q4_hifi_m_tensor_type(const char* tensor_name) { ``` ### 📊 **Expected Results (Qwen3-4B)** -| Metric | Q4_K_M | **Q4_HIFI_M** | +| Metric | Q4_K_M | **Q4_K_HIFI_M** | |--------|--------|---------------| | **PPL** | 14.79 | **14.55–14.65** ✅ | | **Speed** | 200 t/s | **196–198 t/s** ✅ | @@ -89,7 +89,7 @@ static ggml_type get_q4_hifi_m_tensor_type(const char* tensor_name) { --- -## 🚀 **Phase 3: Q4_HIFI_L — Q4_K_M + Strategic Outliers (>10B Models)** +## 🚀 **Phase 3: Q4_K_HIFI_L — Q4_K_M + Strategic Outliers (>10B Models)** ### 🎯 **Objective**: Squeeze extra quality from Q4_K_M on massive models. @@ -116,7 +116,7 @@ static ggml_type get_q4_hifi_l_tensor_type(const char* tensor_name) { ``` ### 📊 **Expected Results (Devstral-123B)** -| Metric | Q4_K_S | **Q4_HIFI_L** | +| Metric | Q4_K_S | **Q4_K_HIFI_L** | |--------|--------|---------------| | **PPL** | 11.24 | **11.10–11.15** ✅ | | **Speed** | 9.75 t/s | **9.65 t/s** ✅ | @@ -152,7 +152,7 @@ void quantize_hifi_family(...) { ./llama-quantize --hifi model-f16.gguf model-hifi.gguf # Manual override -./llama-quantize --quant-type Q4_HIFI_M model-f16.gguf model-hifi-m.gguf +./llama-quantize --quant-type Q4_K_HIFI_M model-f16.gguf model-hifi-m.gguf ``` ### **Step 3: Documentation** @@ -162,8 +162,8 @@ void quantize_hifi_family(...) { | Model Size | Command | Best For | |------------|---------|----------| | ≤2B | `--hifi` | Qwen-0.6B, Phi-3, Gemma-2B | -| 3–10B | `--quant-type Q4_HIFI_M` | Qwen-4B, Llama-3-8B, Mistral-7B | -| >10B | `--quant-type Q4_HIFI_L` | Distrill-123B, Llama-3-70B | +| 3–10B | `--quant-type Q4_K_HIFI_M` | Qwen-4B, Llama-3-8B, Mistral-7B | +| >10B | `--quant-type Q4_K_HIFI_L` | Distrill-123B, Llama-3-70B | ``` --- @@ -174,8 +174,8 @@ void quantize_hifi_family(...) 
{ |-------|-------------|-----|-------|------| | **Qwen3-0.6B** | **Q3_HIFI** | **23.42** | 593 t/s | 469 MiB | | **Qwen3-1.7B** | **Q3_HIFI** | **17.96** | 385 t/s | 1.22 GiB | -| **Qwen3-4B** | **Q4_HIFI_M** | **14.60** | 197 t/s | 2.36 GiB | -| **Devstral-123B** | **Q4_HIFI_L** | **11.12** | 9.65 t/s | 66.7 GiB | +| **Qwen3-4B** | **Q4_K_HIFI_M** | **14.60** | 197 t/s | 2.36 GiB | +| **Devstral-123B** | **Q4_K_HIFI_L** | **11.12** | 9.65 t/s | 66.7 GiB | --- @@ -184,7 +184,7 @@ void quantize_hifi_family(...) { 1. **No more forcing one format to scale** — each size gets its optimal strategy 2. **Builds on proven wins** — Q3_HIFI works, Q4_K_M works, now combine intelligently 3. **Minimal complexity** — no residual quantization, no INT8 experiments -4. **Clear user guidance** — “Use HIFI, we’ll pick the right variant” +4. **Clear user guidance** — "Use HIFI, we'll pick the right variant" --- @@ -193,13 +193,14 @@ void quantize_hifi_family(...) { | Phase | Task | Timeline | |-------|------|----------| | **1** | Q3_HIFI revival (reset + validate) | 3 days | -| **2** | Q4_HIFI_M implementation | 3 days | -| **3** | Q4_HIFI_L implementation | 4 days | +| **2** | Q4_K_HIFI_M implementation | 3 days | +| **3** | Q4_K_HIFI_L implementation | 4 days | | **4** | Unified CLI + documentation | 2 days | | **5** | Upstream PR preparation | 2 days | --- -This roadmap **honors your discoveries** while **avoiding known pitfalls**. You’re not starting over — you’re **focusing your proven strengths** where they matter most. +This roadmap **honors your discoveries** while **avoiding known pitfalls**. You're not starting over — you're **focusing your proven strengths** where they matter most. 
+ +**The HIFI family will be the first quantization approach that truly adapts to model scale — delivering optimal quality, speed, and size at every level.** -**The HIFI family will be the first quantization approach that truly adapts to model scale — delivering optimal quality, speed, and size at every level.** \ No newline at end of file diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index cf3649130be..9a033e87f13 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,8 @@ extern "C" { GGML_TYPE_Q6_K_HIFI = 41, // Q6_K_HIFI: Q6_K layout + 4 FP16 outliers for critical tensors GGML_TYPE_Q6_K_HIFI_DYNAMIC = 42, // Q6_K_HIFI_DYNAMIC: Q6_K + 2-8 outliers based on layer sensitivity GGML_TYPE_Q6_K_HIFI_RES8 = 43, // Q6_K_HIFI_RES8: Q6_K + INT8 residuals (compact format) - GGML_TYPE_COUNT = 44, + GGML_TYPE_Q5_K_HIFI_RES8 = 44, // Q5_K_HIFI_RES8: Q5_K + INT8 residuals (efficient for 4B-10B models) + GGML_TYPE_COUNT = 45, }; // precision diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 14bcfe0e0ee..3d78cf9c0c1 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -415,6 +415,34 @@ typedef struct { // Total: 232 bytes (210 + 22) - saves 4 bytes/block vs Q6_K_HIFI_DYNAMIC static_assert(sizeof(block_q6_k_hifi_res8) == 232, "wrong q6_k_hifi_res8 block size/padding"); +// Q5_K_HIFI_RES8: Efficient Q5_K with INT8 residuals for 4B-10B models +// This format is optimized for mid-scale models where Q6_K overhead is wasteful. +// Q5_K base provides sufficient precision, outliers compensate for 1-bit loss. 
+// Size: 200 bytes vs Q6_K_HIFI_RES8's 232 bytes (~14% smaller) +// Expected results: matches Q6_K_HIFI_RES8 quality at better BPW efficiency +#define Q5_K_HIFI_RES8_MAX_OUTLIERS 8 +typedef struct { + // === Q5_K-COMPATIBLE REGION (176 bytes) - DO NOT REORDER === + GGML_EXTENSION union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // 12 bytes: scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // 32 bytes: quants, high bit + uint8_t qs[QK_K/2]; // 128 bytes: quants, low 4 bits + // === COMPACT INT8 RESIDUAL EXTENSION (24 bytes) === + uint8_t outlier_count; // 1 byte: actual outlier count (1-8) + uint8_t outlier_idx[Q5_K_HIFI_RES8_MAX_OUTLIERS]; // 8 bytes: outlier positions (0-255) + int8_t residual_vals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; // 8 bytes: INT8 residuals (-127 to +127) + uint8_t _padding[3]; // 3 bytes: padding for float alignment + float residual_scale; // 4 bytes: shared scale for residuals +} block_q5_k_hifi_res8; +// Total: 200 bytes (176 + 24) - 14% smaller than Q6_K_HIFI_RES8 +static_assert(sizeof(block_q5_k_hifi_res8) == 200, "wrong q5_k_hifi_res8 block size/padding"); + // This is only used for intermediate quantization and dot products typedef struct { float d; // delta diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 21ed1699e41..636410ac8d9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -303,6 +303,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q5_K_HIFI_RES8] = { + .from_float = quantize_row_q5_k_hifi_res8, // 3-arg wrapper (matches Q6_K_HIFI_RES8 pattern) + .vec_dot = ggml_vec_dot_q5_k_hifi_res8_q8_K, // Efficient Q5_K + INT8 residuals kernel + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, 
[GGML_TYPE_Q4_K] = { .from_float = quantize_row_q4_K, .vec_dot = ggml_vec_dot_q4_K_q8_K, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f03e743fc08..8cf01905477 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -676,6 +676,7 @@ void ggml_compute_forward_add( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -1129,6 +1130,7 @@ void ggml_compute_forward_add1( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -1261,6 +1263,7 @@ void ggml_compute_forward_acc( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -4288,6 +4291,7 @@ void ggml_compute_forward_out_prod( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -4567,6 +4571,7 @@ void ggml_compute_forward_set( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -4793,6 +4798,7 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -5521,6 +5527,7 @@ void ggml_compute_forward_clamp( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: diff --git a/ggml/src/ggml-cpu/quants.c 
// Q5_K_HIFI_RES8: Efficient Q5_K base + INT8 residuals for 4B-10B models
// Uses same correction strategy as Q6_K_HIFI_RES8, but with Q5_K base for better BPW
//
// Computes s = dot(x, y) for one row, where x is Q5_K_HIFI_RES8-quantized and
// y is Q8_K-quantized. The bulk of the work is a scalar re-implementation of
// ggml_vec_dot_q5_K_q8_K_generic over the Q5_K-compatible region of the block;
// afterwards the stored INT8 residuals are folded in at their outlier positions.
//
// Parameters follow the standard ggml vec_dot signature: n = element count
// (multiple of QK_K), s = output scalar, vx/vy = quantized row pointers;
// bs/bx/by/nrc are unused (nrc must be 1).
void ggml_vec_dot_q5_k_hifi_res8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_k_hifi_res8 * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of super-blocks in the row

    // Masks used to unpack the 6-bit scales/mins from the packed 12-byte field
    // (identical to the Q4_K/Q5_K scale unpacking).
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];
    const uint8_t * scales = (const uint8_t*)&utmp[0];  // 8 sub-block scales
    const uint8_t * mins = (const uint8_t*)&utmp[2];    // 8 sub-block mins

    int8_t aux8[QK_K];    // expanded 5-bit quants (0..31)
    int16_t aux16[8];     // per-lane products
    float sums [8];       // per-lane float accumulators across blocks
    int32_t aux32[8];     // per-lane int accumulators within a block
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        // === Q5_K bulk dot product (same as ggml_vec_dot_q5_K_q8_K_generic) ===
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;   // low 4 bits of quants
        const uint8_t * GGML_RESTRICT hm = x[i].qh;   // high (5th) bit of quants
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;  // walking bit mask selecting the high bit for each 32-group
        for (int j = 0; j < QK_K; j += 64) {
            // Expand 64 weights per iteration: low nibbles then high nibbles,
            // each combined with its high bit (+16) from qh.
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF) + (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4) + (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        // Unpack the packed 6-bit scales/mins into utmp (scales -> utmp[0..1],
        // mins -> utmp[2..3]).
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // Accumulate the min-correction term using the precomputed q8 bsums.
        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        // 8 sub-blocks of 32 weights, each with its own 6-bit scale.
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;

        // === INT8 RESIDUAL CORRECTION ===
        // Add residual * activation corrections at the stored outlier positions.
        // Residuals are INT8 values scaled by residual_scale/127; the q8
        // activation scale y[i].d converts the quantized activation back to float.
        const int outlier_count = x[i].outlier_count;
        const float res_scale = x[i].residual_scale;
        const float d8 = y[i].d;
        const float scale_factor = res_scale * (1.0f / 127.0f) * d8;
        for (int k = 0; k < outlier_count; ++k) {
            const int idx = x[i].outlier_idx[k];
            const int8_t activation = y[i].qs[idx];
            // Skip near-zero activations (|q8| <= 4): their contribution is
            // negligible. NOTE(review): stated to be the same threshold as
            // Q6_K_HIFI — confirm against that kernel.
            if (activation > 4 || activation < -4) {
                const float residual = x[i].residual_vals[k] * scale_factor;
                sumf += residual * activation;
            }
        }
    }
    // Fold the per-lane accumulators into the scalar result.
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
}
+void quantize_row_q5_k_hifi_res8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q5_k_hifi_res8_ref(x, (block_q5_k_hifi_res8 *)y, k); +} + void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index 0bd5b741cb9..76548c4caf6 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -30,6 +30,8 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q6_k_hifi(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q6_k_hifi_dynamic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q6_k_hifi_res8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_k_hifi_res8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_q5_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -56,6 +58,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_k_hifi_dynamic_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_k_hifi_res8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t 
by, int nrc); +void ggml_vec_dot_q5_k_hifi_res8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index d2732f0d330..a8a492394ab 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -853,6 +853,13 @@ struct ggml_cuda_type_traits { static constexpr int qi = QI6_K; }; +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR5_K; + static constexpr int qi = QI5_K; +}; + template<> struct ggml_cuda_type_traits { static constexpr int qk = QK_K; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 9c15a411e44..4f17fed8c52 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -407,6 +407,56 @@ static __global__ void dequantize_block_q6_k_hifi_res8(const void * __restrict__ } } +// Q5_K_HIFI_RES8: Efficient Q5_K base with INT8 residuals for 4B-10B models +template +static __global__ void dequantize_block_q5_k_hifi_res8(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q5_k_hifi_res8 * x = (const block_q5_k_hifi_res8 *) vx; + + const int64_t i = blockIdx.x; + + // Q5_K bulk dequantization (same as dequantize_block_q5_K) + const int64_t tid = threadIdx.x; + const int64_t il = tid/16; // il is in 0...3 + const int64_t ir = tid%16; // ir is in 0...15 + const int64_t is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const 
uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; + + // Thread 0 handles INT8 residual corrections + __syncthreads(); + if (threadIdx.x == 0) { + dst_t * yb = yy + i*QK_K; + const int outlier_count = x[i].outlier_count; + const float res_scale = x[i].residual_scale; + const float scale_factor = res_scale * (1.0f / 127.0f); + // Add residual corrections at outlier positions + for (int k = 0; k < outlier_count && k < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k) { + const int idx = x[i].outlier_idx[k]; + const float residual = x[i].residual_vals[k] * scale_factor; + yb[idx] += residual; + } + } +} + template static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -743,6 +793,12 @@ static void dequantize_row_q6_k_hifi_res8_cuda(const void * vx, dst_t * y, const dequantize_block_q6_k_hifi_res8<<>>(vx, y); } +template +static void dequantize_row_q5_k_hifi_res8_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_q5_k_hifi_res8<<>>(vx, y); +} + template static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; @@ -876,6 +932,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_q6_k_hifi_dynamic_cuda; case GGML_TYPE_Q6_K_HIFI_RES8: return dequantize_row_q6_k_hifi_res8_cuda; + case GGML_TYPE_Q5_K_HIFI_RES8: + return 
dequantize_row_q5_k_hifi_res8_cuda; case GGML_TYPE_Q4_K: return dequantize_row_q4_K_cuda; case GGML_TYPE_Q5_K: @@ -935,6 +993,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_q6_k_hifi_dynamic_cuda; case GGML_TYPE_Q6_K_HIFI_RES8: return dequantize_row_q6_k_hifi_res8_cuda; + case GGML_TYPE_Q5_K_HIFI_RES8: + return dequantize_row_q5_k_hifi_res8_cuda; case GGML_TYPE_Q4_K: return dequantize_row_q4_K_cuda; case GGML_TYPE_Q5_K: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index af00aee2ea7..06e1816f3fa 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4386,6 +4386,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 5a0d6c9e439..5dd8318604b 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -21,6 +21,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) case GGML_TYPE_Q6_K_HIFI: return vec_dot_q6_K_q8_1; // Reuse Q6_K kernel case GGML_TYPE_Q6_K_HIFI_DYNAMIC: return vec_dot_q6_K_q8_1; // Reuse Q6_K kernel case GGML_TYPE_Q6_K_HIFI_RES8: return vec_dot_q6_k_hifi_res8_q8_1; // HIFI kernel with residual corrections + case GGML_TYPE_Q5_K_HIFI_RES8: return vec_dot_q5_k_hifi_res8_q8_1; // HIFI kernel with residual corrections case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; @@ -51,6 +52,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { case GGML_TYPE_Q6_K_HIFI: return VDR_Q6_K_Q8_1_MMVQ; // Same as Q6_K case GGML_TYPE_Q6_K_HIFI_DYNAMIC: return VDR_Q6_K_Q8_1_MMVQ; // Same as Q6_K case GGML_TYPE_Q6_K_HIFI_RES8: return 
// Q5_K_HIFI_RES8: Q5_K layout + INT8 residuals + per-block scale
// Efficient format for 4B-10B models with Q5_K base (176 bytes vs Q6_K's 210)
#define VDR_Q5_K_HIFI_RES8_Q8_1_MMVQ VDR_Q5_K_Q8_1_MMVQ

// Per-thread partial dot product of one Q5_K_HIFI_RES8 block against Q8_1
// activations, used by the mmvq path. The bulk term mirrors vec_dot_q5_K_q8_1;
// the residual term is added exactly once per block by the iqs == 0 lane.
static __device__ __forceinline__ float vec_dot_q5_k_hifi_res8_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q5_k_hifi_res8 * bq5_hifi = (const block_q5_k_hifi_res8 *) vbq + kbx;

    // === Q5_K bulk dot product (same as vec_dot_q5_K_q8_1) ===
    int vl[2];        // low 4-bit quants, packed 4 per int
    int vh[2];        // high-bit planes, shifted into position
    int u[2*QR5_K];   // matching q8 activations
    float d8[QR5_K];  // q8 block scales

    // Select which q8_1 sub-blocks and which 32-weight slice this lane handles.
    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_hifi->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_hifi->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    // Unpack the two 6-bit scale/min pairs relevant to this slice from the
    // packed 12-byte scales field (same scheme as Q4_K/Q5_K).
    const uint16_t * scales = (const uint16_t *)bq5_hifi->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;  // sub-block scales
    const uint8_t * m = sc + 2;                 // sub-block mins

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    float sum = vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_hifi->dm, d8);

    // === INT8 RESIDUAL CORRECTION ===
    const int outlier_count = bq5_hifi->outlier_count;

    if (outlier_count > 0) {
        // Shared residual scale; residual_vals are INT8 in [-127, 127].
        const float res_scale = bq5_hifi->residual_scale * (1.0f / 127.0f);

        // Only the iqs == 0 lane adds the residual term so each block
        // contributes it exactly once before the caller's reduction.
        // NOTE(review): assumes the mmvq caller invokes every iqs for each
        // block — confirm against the Q6_K_HIFI_RES8 kernel's convention.
        if (iqs == 0) {
            for (int k = 0; k < outlier_count && k < 8; ++k) {
                const int idx = bq5_hifi->outlier_idx[k];
                const int idx_bq8 = idx / QK8_1;     // which q8_1 sub-block
                const int idx_in_bq8 = idx % QK8_1;  // position inside it

                const int8_t q8_val = ((const int8_t*)bq8_1[idx_bq8].qs)[idx_in_bq8];
                const float d8_val = __low2float(bq8_1[idx_bq8].ds);
                const float residual = res_scale * bq5_hifi->residual_vals[k];
                sum += residual * q8_val * d8_val;
            }
        }
    }

    return sum;
}
// Strategy 1: score one quantization block by how "outlier-prone" its imatrix
// weights look. The score blends the coefficient of variation (relative
// spread) with a max/mean "spikiness" ratio and is mapped into [0.2, 0.9].
// Returns neutral defaults when no usable imatrix data is available:
// 0.5 for missing data, 0.3 for an (almost) all-zero block.
float ggml_hifi_compute_block_importance(
    const float * imatrix_block,
    int block_size
) {
    // No imatrix data for this block: report medium importance.
    if (imatrix_block == NULL || block_size <= 0) {
        return 0.5f;
    }

    // Single pass: accumulate sum, sum of squares, and the peak value.
    double total    = 0.0;
    double total_sq = 0.0;
    double peak     = 0.0;
    for (int i = 0; i < block_size; ++i) {
        const double w = (double)imatrix_block[i];
        total    += w;
        total_sq += w * w;
        if (w > peak) {
            peak = w;
        }
    }

    const double mean = total / (double)block_size;
    if (mean < 1e-10) {
        return 0.3f;  // near-zero weights: low importance
    }

    const double mean_sq = total_sq / (double)block_size;
    double variance = mean_sq - mean * mean;
    if (variance < 0) {
        variance = 0;  // guard against tiny negative values from rounding
    }

    const double cv        = sqrt(variance) / mean;  // relative spread
    const double spikiness = peak / mean;            // extreme-value ratio, typically 1-20

    // High CV => some weights are outliers; high spikiness => extreme values.
    // Either indicates the block benefits from more residual corrections.
    const double combined = 0.6 * cv + 0.4 * (spikiness / 10.0);

    // Map combined (typically 0-3) into the [0.2, 0.9] importance band.
    float importance = 0.2f + 0.7f * (float)(combined / 2.0);
    if (importance > 0.9f) importance = 0.9f;
    if (importance < 0.2f) importance = 0.2f;

    return importance;
}
or down based on block importance +int ggml_hifi_compute_block_outlier_count( + float block_importance, + int base_outlier_count, + float model_params_b +) { + // Scale factor based on block importance + // High importance (>0.7): boost outliers up to 1.5x + // Low importance (<0.3): reduce outliers down to 0.5x + // Medium importance: keep base count + float scale = 1.0f; + + if (block_importance > 0.7f) { + // High importance block - boost outliers + scale = 1.0f + 0.5f * (block_importance - 0.7f) / 0.3f; // 1.0 to 1.5 + } else if (block_importance < 0.3f) { + // Low importance block - reduce outliers + scale = 0.5f + 0.5f * (block_importance / 0.3f); // 0.5 to 1.0 + } + + // For larger models, be more aggressive with reduction on low-importance blocks + if (model_params_b >= 7.0f && block_importance < 0.4f) { + scale *= 0.8f; // Additional 20% reduction for large models + } + + int adjusted_count = (int)roundf((float)base_outlier_count * scale); + + // Clamp to valid range [1, 8] + // Allow minimum of 1 for low-importance blocks (save more space) + if (adjusted_count < 1) adjusted_count = 1; + if (adjusted_count > 8) adjusted_count = 8; + + return adjusted_count; +} + diff --git a/ggml/src/ggml-quants-hifi.h b/ggml/src/ggml-quants-hifi.h index 919bbcca728..89a0b8ba823 100644 --- a/ggml/src/ggml-quants-hifi.h +++ b/ggml/src/ggml-quants-hifi.h @@ -1,5 +1,5 @@ // GGML HIFI Quantization Context -// Provides layer-adaptive outlier allocation for Q4_HIFI quantization +// Provides layer-adaptive outlier allocation for Q4_K_HIFI quantization // // This header defines the context infrastructure for passing layer-specific // parameters to the quantization functions without modifying the core GGML API. 
@@ -20,6 +20,12 @@ extern "C" { #define Q6_K_HIFI_RES8_MAX_OUTLIERS 8 #endif +// Maximum outliers per block for Q5_K_HIFI_RES8 format +// Must match the value in ggml-common.h +#ifndef Q5_K_HIFI_RES8_MAX_OUTLIERS +#define Q5_K_HIFI_RES8_MAX_OUTLIERS 8 +#endif + // Layer-adaptive quantization context // Used to pass dynamic parameters to Q6_K_HIFI_RES8 quantization typedef struct { @@ -63,6 +69,30 @@ GGML_API float ggml_hifi_compute_tensor_importance( int64_t n_elements ); +// Strategy 1: Compute per-block importance from imatrix data +// Used for adaptive per-block outlier allocation +// Parameters: +// imatrix_block: Per-element importance weights for this block (QK_K elements) +// block_size: Number of elements in the block (typically QK_K = 256) +// Returns: Block importance score (0.0-1.0) +GGML_API float ggml_hifi_compute_block_importance( + const float * imatrix_block, + int block_size +); + +// Strategy 1: Compute per-block outlier count based on local imatrix variance +// High variance blocks get more outliers, low variance blocks get fewer +// Parameters: +// block_importance: Importance score for this block (0.0-1.0) +// base_outlier_count: Base outlier count from tensor-level computation +// model_params_b: Model size in billions +// Returns: Adjusted outlier count for this block (2-8) +GGML_API int ggml_hifi_compute_block_outlier_count( + float block_importance, + int base_outlier_count, + float model_params_b +); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 4eec5c6a6e7..43cf95f7b60 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -2441,19 +2441,35 @@ void quantize_row_q6_k_hifi_res8_ref(const float * GGML_RESTRICT x, block_q6_k_h quantize_row_q6_k_hifi_res8_ref_ex(x, y, k, Q6_K_HIFI_RES8_MAX_OUTLIERS); } -// imatrix-aware quantization implementation -static void quantize_row_q6_k_hifi_res8_impl(const float * GGML_RESTRICT x, block_q6_k_hifi_res8 * GGML_RESTRICT y, int64_t k, const 
float * GGML_RESTRICT quant_weights, int outlier_count) { +// imatrix-aware quantization implementation with per-block adaptive outliers (Strategy 1) +static void quantize_row_q6_k_hifi_res8_impl(const float * GGML_RESTRICT x, block_q6_k_hifi_res8 * GGML_RESTRICT y, int64_t k, const float * GGML_RESTRICT quant_weights, int base_outlier_count) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; - if (outlier_count < 1) outlier_count = 1; - if (outlier_count > Q6_K_HIFI_RES8_MAX_OUTLIERS) outlier_count = Q6_K_HIFI_RES8_MAX_OUTLIERS; + if (base_outlier_count < 1) base_outlier_count = 1; + if (base_outlier_count > Q6_K_HIFI_RES8_MAX_OUTLIERS) base_outlier_count = Q6_K_HIFI_RES8_MAX_OUTLIERS; + + // Get model size from HIFI context for per-block adaptation + float model_params_b = 1.0f; // Default to 1B for Q6_K (small models) + const ggml_hifi_quant_context * hifi_ctx = ggml_hifi_get_context(); + if (hifi_ctx && hifi_ctx->is_active) { + model_params_b = hifi_ctx->model_params_b; + } for (int64_t ib = 0; ib < nb; ++ib) { const float * xb = x + ib * QK_K; const float * qw = quant_weights ? 
quant_weights + ib * QK_K : NULL; block_q6_k_hifi_res8 * block = &y[ib]; + // Strategy 1: Compute per-block adaptive outlier count based on local imatrix variance + int outlier_count = base_outlier_count; + if (qw != NULL) { + // Compute block importance from local imatrix data + float block_importance = ggml_hifi_compute_block_importance(qw, QK_K); + // Adjust outlier count based on block importance + outlier_count = ggml_hifi_compute_block_outlier_count(block_importance, base_outlier_count, model_params_b); + } + block->outlier_count = (uint8_t)outlier_count; block->_padding = 0; @@ -2577,6 +2593,240 @@ size_t quantize_q6_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_REST return nrow * row_size; } +// ===================================================================== +// Q5_K_HIFI_RES8: Efficient Q5_K with INT8 residuals for 4B-10B models +// Uses Q5_K base (176 bytes) instead of Q6_K (210 bytes) for better BPW +// ===================================================================== + +// Extended quantization function with explicit outlier count +void quantize_row_q5_k_hifi_res8_ref_ex(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k, int outlier_count) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + // Clamp outlier count to valid range + if (outlier_count < 1) outlier_count = 1; + if (outlier_count > Q5_K_HIFI_RES8_MAX_OUTLIERS) outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib * QK_K; + block_q5_k_hifi_res8 * block = &y[ib]; + + // Initialize extension fields + block->outlier_count = (uint8_t)outlier_count; + memset(block->_padding, 0, sizeof(block->_padding)); + + // Step 1: Find top-k outliers by magnitude + float mag[QK_K]; + for (int i = 0; i < QK_K; ++i) { + mag[i] = fabsf(xb[i]); + } + + // Simple selection sort for top-k (k <= 8, so O(n*k) is fine) + int outlier_indices[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < 
outlier_count; ++k_idx) { + int max_idx = 0; + float max_val = mag[0]; + for (int i = 1; i < QK_K; ++i) { + if (mag[i] > max_val) { + max_val = mag[i]; + max_idx = i; + } + } + outlier_indices[k_idx] = max_idx; + mag[max_idx] = -1.0f; // Mark as used + } + + // Step 2: Zero outliers temporarily and quantize as Q5_K + float tmp[QK_K]; + memcpy(tmp, xb, QK_K * sizeof(float)); + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + tmp[outlier_indices[k_idx]] = 0.0f; + } + + // Quantize the Q5_K base (this fills dm, scales, qh, qs) + quantize_row_q5_K_ref(tmp, (block_q5_K *)block, QK_K); + + // Step 3: Compute residuals from Q5_K reconstruction + float dequant[QK_K]; + dequantize_row_q5_K((const block_q5_K *)block, dequant, QK_K); + + float max_residual = 0.0f; + float residuals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + const int idx = outlier_indices[k_idx]; + residuals[k_idx] = xb[idx] - dequant[idx]; + if (fabsf(residuals[k_idx]) > max_residual) { + max_residual = fabsf(residuals[k_idx]); + } + } + + // Handle zero case + if (max_residual == 0.0f) max_residual = 1e-8f; + block->residual_scale = max_residual; + + // Step 4: Store indices and INT8-quantized residuals + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + block->outlier_idx[k_idx] = (uint8_t)outlier_indices[k_idx]; + float norm_res = residuals[k_idx] / max_residual; + block->residual_vals[k_idx] = (int8_t)roundf(norm_res * 127.0f); + } + // Zero-fill remaining slots + for (int k_idx = outlier_count; k_idx < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k_idx) { + block->outlier_idx[k_idx] = 0; + block->residual_vals[k_idx] = 0; + } + } +} + +// 3-argument wrapper for ggml_from_float_t compatibility +void quantize_row_q5_k_hifi_res8_ref(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k) { + quantize_row_q5_k_hifi_res8_ref_ex(x, y, k, Q5_K_HIFI_RES8_MAX_OUTLIERS); +} + +// imatrix-aware quantization implementation with per-block adaptive 
outliers (Strategy 1) +static void quantize_row_q5_k_hifi_res8_impl(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k, const float * GGML_RESTRICT quant_weights, int base_outlier_count) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + if (base_outlier_count < 1) base_outlier_count = 1; + if (base_outlier_count > Q5_K_HIFI_RES8_MAX_OUTLIERS) base_outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + + // Get model size from HIFI context for per-block adaptation + float model_params_b = 4.0f; // Default to 4B if no context + const ggml_hifi_quant_context * hifi_ctx = ggml_hifi_get_context(); + if (hifi_ctx && hifi_ctx->is_active) { + model_params_b = hifi_ctx->model_params_b; + } + + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib * QK_K; + const float * qw = quant_weights ? quant_weights + ib * QK_K : NULL; + block_q5_k_hifi_res8 * block = &y[ib]; + + // Strategy 1: Compute per-block adaptive outlier count based on local imatrix variance + int outlier_count = base_outlier_count; + if (qw != NULL) { + // Compute block importance from local imatrix data + float block_importance = ggml_hifi_compute_block_importance(qw, QK_K); + // Adjust outlier count based on block importance + outlier_count = ggml_hifi_compute_block_outlier_count(block_importance, base_outlier_count, model_params_b); + } + + block->outlier_count = (uint8_t)outlier_count; + memset(block->_padding, 0, sizeof(block->_padding)); + + // Find top-k outliers using imatrix-weighted importance + float importance[QK_K]; + for (int i = 0; i < QK_K; ++i) { + float weight = qw ? 
qw[i] : 1.0f; + importance[i] = fabsf(xb[i]) * weight; + } + + int outlier_indices[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + int max_idx = 0; + float max_val = importance[0]; + for (int i = 1; i < QK_K; ++i) { + if (importance[i] > max_val) { + max_val = importance[i]; + max_idx = i; + } + } + outlier_indices[k_idx] = max_idx; + importance[max_idx] = -1.0f; + } + + // Zero outliers and quantize Q5_K base + float tmp[QK_K]; + memcpy(tmp, xb, QK_K * sizeof(float)); + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + tmp[outlier_indices[k_idx]] = 0.0f; + } + quantize_row_q5_K_ref(tmp, (block_q5_K *)block, QK_K); + + // Compute residuals + float dequant[QK_K]; + dequantize_row_q5_K((const block_q5_K *)block, dequant, QK_K); + + float max_residual = 0.0f; + float residuals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + const int idx = outlier_indices[k_idx]; + residuals[k_idx] = xb[idx] - dequant[idx]; + if (fabsf(residuals[k_idx]) > max_residual) { + max_residual = fabsf(residuals[k_idx]); + } + } + + if (max_residual == 0.0f) max_residual = 1e-8f; + block->residual_scale = max_residual; + + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + block->outlier_idx[k_idx] = (uint8_t)outlier_indices[k_idx]; + float norm_res = residuals[k_idx] / max_residual; + block->residual_vals[k_idx] = (int8_t)roundf(norm_res * 127.0f); + } + for (int k_idx = outlier_count; k_idx < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k_idx) { + block->outlier_idx[k_idx] = 0; + block->residual_vals[k_idx] = 0; + } + } +} + +// Dequantization: Q5_K base + INT8 residual corrections +void dequantize_row_q5_k_hifi_res8(const block_q5_k_hifi_res8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t ib = 0; ib < nb; ++ib) { + const block_q5_k_hifi_res8 * block = &x[ib]; + float * yb = y + ib * QK_K; + + // Dequantize Q5_K base + 
dequantize_row_q5_K((const block_q5_K *)block, yb, QK_K); + + // Add residual corrections at outlier positions + const int outlier_count = block->outlier_count; + const float scale = block->residual_scale; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + const int idx = block->outlier_idx[k_idx]; + const float residual = scale * (block->residual_vals[k_idx] / 127.0f); + yb[idx] += residual; + } + } +} + +// Public quantization function with imatrix support +size_t quantize_q5_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + size_t row_size = ggml_row_size(GGML_TYPE_Q5_K_HIFI_RES8, n_per_row); + + // Get adaptive outlier count from HIFI context if available + int outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + const ggml_hifi_quant_context * hifi_ctx = ggml_hifi_get_context(); + if (hifi_ctx && hifi_ctx->is_active) { + outlier_count = hifi_ctx->outlier_count; + if (outlier_count < 1) outlier_count = 1; + if (outlier_count > Q5_K_HIFI_RES8_MAX_OUTLIERS) outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + } + + if (!quant_weights) { + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q5_k_hifi_res8_ref_ex(src, (block_q5_k_hifi_res8*)qrow, n_per_row, outlier_count); + src += n_per_row; + qrow += row_size; + } + } else { + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q5_k_hifi_res8_impl(src, (block_q5_k_hifi_res8*)qrow, n_per_row, quant_weights, outlier_count); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_0 == 32, "QK4_0 must be 32"); @@ -6019,6 +6269,11 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_k_hifi_res8, data, nb); } break; + 
case GGML_TYPE_Q5_K_HIFI_RES8: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_k_hifi_res8, data, nb); + } break; + case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 5eeea860fcf..bb573278ce3 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -123,6 +123,13 @@ GGML_API void quantize_row_q6_k_hifi_res8_ref_ex(const float * GGML_RESTRICT x, GGML_API void dequantize_row_q6_k_hifi_res8(const block_q6_k_hifi_res8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API size_t quantize_q6_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +// Q5_K_HIFI_RES8: Efficient Q5_K with INT8 residuals for 4B-10B models +// Uses Q5_K base (176 bytes) instead of Q6_K (210 bytes) for better BPW efficiency +GGML_API void quantize_row_q5_k_hifi_res8_ref(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q5_k_hifi_res8_ref_ex(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k, int outlier_count); +GGML_API void dequantize_row_q5_k_hifi_res8(const block_q5_k_hifi_res8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q5_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e4ffc321c2b..b02b4ee6c4d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -764,6 +764,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q6_k_hifi_res8, .from_float_ref = (ggml_from_float_t) quantize_row_q6_k_hifi_res8_ref, }, + [GGML_TYPE_Q5_K_HIFI_RES8] = { + .type_name = "Q5_K_HIFI_RES8", + .blck_size = QK_K, + .type_size = sizeof(block_q5_k_hifi_res8), + .is_quantized = true, + .to_float = 
(ggml_to_float_t) dequantize_row_q5_k_hifi_res8, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_k_hifi_res8_ref, + }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", .blck_size = QK_K, @@ -7573,6 +7581,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q6_K_HIFI: result = quantize_q6_k_hifi(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K_HIFI_DYNAMIC: result = quantize_q6_k_hifi_dynamic(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K_HIFI_RES8: result = quantize_q6_k_hifi_res8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_K_HIFI_RES8: result = quantize_q5_k_hifi_res8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 32f4a002b88..aed19226442 100644 --- a/include/llama.h +++ b/include/llama.h @@ -152,8 +152,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors - // Legacy HIFI types (39-43) removed - consolidated into Q4_HIFI (44) - LLAMA_FTYPE_MOSTLY_Q4_HIFI = 44, // Q4_K_M + 2-8 dynamic outliers + early exit (best quality/size ratio) + // Legacy HIFI types (39-43) removed - consolidated into Q4_K_HIFI (44) + LLAMA_FTYPE_MOSTLY_Q4_K_HIFI = 44, // Q4_K_M + 2-8 dynamic outliers + early exit (best quality/size ratio) LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ed87421bfd9..bec7617441a 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -60,7 +60,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return 
"IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_HIFI: return "Q4_HIFI - ~4.95 bpw (Q4_K_M + INT8 residuals, compact)"; + case LLAMA_FTYPE_MOSTLY_Q4_K_HIFI: return "Q4_K_HIFI - ~4.95 bpw (Q4_K_M + INT8 residuals, compact)"; default: return "unknown, may not work"; } @@ -663,8 +663,9 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q6_K_HIFI_DYNAMIC: ftype = LLAMA_FTYPE_MOSTLY_Q4_HIFI; break; - case GGML_TYPE_Q6_K_HIFI_RES8: ftype = LLAMA_FTYPE_MOSTLY_Q4_HIFI; break; + case GGML_TYPE_Q6_K_HIFI_DYNAMIC: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_HIFI; break; + case GGML_TYPE_Q6_K_HIFI_RES8: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_HIFI; break; + case GGML_TYPE_Q5_K_HIFI_RES8: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_HIFI; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 79e699576dc..d3dc75e6599 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -48,18 +48,42 @@ static float compute_model_params_b(const llama_hparams & hparams, int64_t n_voc return (float)(attn_params + ffn_params + emb_params) / 1e9f; } +// Get the appropriate HIFI type based on model size +// Q5_K_HIFI_RES8 is more efficient for 4B-10B models (176-byte base vs 210-byte) +// Q6_K_HIFI_RES8 is better for small models where every bit counts +static ggml_type get_hifi_enhanced_type(float model_params_b) { + if (model_params_b <= 2.0f) { + // Small models (≤2B): Q6_K base for maximum quality + return GGML_TYPE_Q6_K_HIFI_RES8; + } else if (model_params_b <= 12.0f) { + // Medium models (4B-10B): Q5_K base for better BPW efficiency + // Q5_K + outliers ≈ Q6_K quality, but 15% smaller + return GGML_TYPE_Q5_K_HIFI_RES8; + } else { + // Large models (>12B): 
Q5_K for efficiency (diminishing returns from Q6_K) + return GGML_TYPE_Q5_K_HIFI_RES8; + } +} + // Get the percentage of attn_v layers to enhance based on model size // Smaller models benefit more from enhancement, larger models have diminishing returns +// Strategy 3: For very large models (>10B), skip attn_v enhancement entirely +// Only token_embd and output.weight are enhanced (handled separately) static float get_hifi_enhancement_threshold(float model_params_b) { if (model_params_b <= 2.0f) { // Small models (≤2B): enhance 50% of layers - high ROI return 0.50f; - } else if (model_params_b <= 8.0f) { - // Medium models (2-8B): enhance 30% of layers - moderate ROI + } else if (model_params_b <= 5.0f) { + // Medium-small models (2-5B): enhance 30% of layers - moderate ROI return 0.30f; + } else if (model_params_b <= 10.0f) { + // Medium-large models (5-10B): enhance 20% of layers - lower ROI + return 0.20f; } else { - // Large models (>8B): enhance only 15% of layers - diminishing returns - return 0.15f; + // Very large models (>10B): Skip ALL attn_v enhancement + // Only token_embd and output.weight are enhanced (reduces overhead significantly) + // Research shows attn_v enhancement provides <0.05% PPL improvement at >10B + return 0.0f; } } @@ -260,9 +284,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - // Q4_HIFI: Q6_K_HIFI_RES8 (Q6_K + INT8 residuals) on output - always critical - new_type = GGML_TYPE_Q6_K_HIFI_RES8; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + // Q4_K_HIFI: Use size-aware HIFI type on output - always critical + // Q5_K_HIFI_RES8 for 4B-10B, Q6_K_HIFI_RES8 for smaller models + const float model_params_b = compute_model_params_b(qs.model.hparams, qs.model.vocab.n_tokens()); + new_type = get_hifi_enhanced_type(model_params_b); } else if (new_type != GGML_TYPE_Q8_0) { new_type = 
GGML_TYPE_Q6_K; @@ -293,9 +319,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - // Q4_HIFI: Q6_K_HIFI_RES8 (Q6_K + INT8 residuals) on token embeddings - always critical - new_type = GGML_TYPE_Q6_K_HIFI_RES8; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + // Q4_K_HIFI: Use size-aware HIFI type on token embeddings - always critical + // Q5_K_HIFI_RES8 for 4B-10B, Q6_K_HIFI_RES8 for smaller models + const float model_params_b = compute_model_params_b(qs.model.hparams, qs.model.vocab.n_tokens()); + new_type = get_hifi_enhanced_type(model_params_b); } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || @@ -341,18 +369,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) {
-            // Q4_HIFI: Model-size-aware enhancement to optimize size vs quality tradeoff
-            // - Small models (≤2B): enhance 50% of attn_v layers (high ROI)
-            // - Medium models (2-8B): enhance 30% of attn_v layers (moderate ROI)
-            // - Large models (>8B): enhance 15% of attn_v layers (diminishing returns)
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) {
+            // Q4_K_HIFI: Model-size-aware enhancement to optimize size vs quality tradeoff
+            // - Small models (≤2B): Q6_K_HIFI_RES8, enhance 50% of attn_v layers (high ROI)
+            // - Medium models: Q5_K_HIFI_RES8, enhance 30% of layers at 2-5B, 20% at 5-10B (optimal BPW)
+            // - Very large models (>10B): threshold is 0% — attn_v enhancement is skipped entirely;
+            //   only token_embd/output are enhanced (see get_hifi_enhancement_threshold)
             // This reduces enhanced tensor count significantly for large models while
             // preserving quality where it matters (early layers + embeddings)
             const float model_params_b = compute_model_params_b(qs.model.hparams, qs.model.vocab.n_tokens());
             const float enhancement_threshold = get_hifi_enhancement_threshold(model_params_b);
+            const ggml_type hifi_type = get_hifi_enhanced_type(model_params_b);
             if (qs.i_attention_wv <= qs.n_attention_wv * enhancement_threshold) {
-                new_type = GGML_TYPE_Q6_K_HIFI_RES8;
+                new_type = hifi_type;  // Use size-appropriate HIFI type
             } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
                 new_type = GGML_TYPE_Q6_K;  // Follow Q4_K_M behavior for critical late layers
             }
@@ -418,8 +447,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
         new_type = arch == LLM_ARCH_FALCON ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - // Q4_HIFI follows Q4_K_M behavior for ffn_down + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + // Q4_K_HIFI follows Q4_K_M behavior for ffn_down if (arch == LLM_ARCH_FALCON) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -466,7 +495,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate") != std::string::npos) { @@ -652,7 +681,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_HIFI: default_type = GGML_TYPE_Q4_K; break; // Q4_K_M + dynamic outliers + early exit + case LLAMA_FTYPE_MOSTLY_Q4_K_HIFI: default_type = GGML_TYPE_Q4_K; break; // Q4_K_M + dynamic outliers + early exit default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -723,8 +752,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV // Set quantization type string for Hugging Face model card display - if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - 
gguf_set_val_str(ctx_out.get(), "general.quantization_type", "Q4_HIFI"); + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + gguf_set_val_str(ctx_out.get(), "general.quantization_type", "Q4_K_HIFI"); } // Remove split metadata @@ -1070,7 +1099,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_hifi_quant_context hifi_ctx = {}; const ggml_hifi_quant_context * hifi_ctx_ptr = nullptr; - if (new_type == GGML_TYPE_Q6_K_HIFI_RES8 && ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { + // Handle both Q6_K_HIFI_RES8 and Q5_K_HIFI_RES8 HIFI types + const bool is_hifi_type = (new_type == GGML_TYPE_Q6_K_HIFI_RES8 || new_type == GGML_TYPE_Q5_K_HIFI_RES8); + if (is_hifi_type && ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { // Extract layer index from tensor name (e.g., "blk.5.attn_v.weight" -> 5) int layer_idx = -1; if (sscanf(name.c_str(), "blk.%d.", &layer_idx) != 1) { @@ -1105,14 +1136,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // Compute adaptive outlier count + // Use the appropriate max outliers constant based on type + const int max_outliers = (new_type == GGML_TYPE_Q5_K_HIFI_RES8) + ? 
Q5_K_HIFI_RES8_MAX_OUTLIERS : Q6_K_HIFI_RES8_MAX_OUTLIERS; int outlier_count; if (layer_idx < 0) { // Critical non-layer tensors (token_embd, output.weight): max outliers - outlier_count = Q6_K_HIFI_RES8_MAX_OUTLIERS; + outlier_count = max_outliers; } else { outlier_count = ggml_hifi_compute_outlier_count( layer_idx, n_layers, layer_importance, model_params_b ); + // Clamp to the type's max outliers + if (outlier_count > max_outliers) outlier_count = max_outliers; } // Set up context @@ -1125,8 +1161,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: hifi_ctx_ptr = &hifi_ctx; // Log adaptive outlier allocation (INFO level for visibility) - LLAMA_LOG_INFO("(HIFI: model=%.1fB layer=%d/%d imp=%.2f outliers=%d) ", - model_params_b, layer_idx, n_layers, layer_importance, outlier_count); + const char * type_name = (new_type == GGML_TYPE_Q5_K_HIFI_RES8) ? "Q5_K_HIFI" : "Q6_K_HIFI"; + LLAMA_LOG_INFO("(%s: model=%.1fB layer=%d/%d imp=%.2f outliers=%d) ", + type_name, model_params_b, layer_idx, n_layers, layer_importance, outlier_count); } for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 034cc2f41a4..f4d775c070c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -43,7 +43,7 @@ static const std::vector QUANT_OPTIONS = { { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, - { "Q4_HIFI", LLAMA_FTYPE_MOSTLY_Q4_HIFI, " ~4.95 bpw Q4_K_M + INT8 residuals (best quality-per-byte)", }, + { "Q4_K_HIFI", LLAMA_FTYPE_MOSTLY_Q4_K_HIFI, " ~4.95 bpw Q4_K_M + INT8 residuals (best quality-per-byte)", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, { "Q4_K", 
LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },