diff --git a/Q4_HIFI_ROADMAP.md b/Q4_K_HIFI_ROADMAP.md similarity index 83% rename from Q4_HIFI_ROADMAP.md rename to Q4_K_HIFI_ROADMAP.md index ee6d8c329dc..72bface8375 100644 --- a/Q4_HIFI_ROADMAP.md +++ b/Q4_K_HIFI_ROADMAP.md @@ -13,7 +13,7 @@ Geoff Munn​ | Finding | Strategic Implication | |--------|------------------------| | ✅ **Q3_HIFI excels on ≤2B models** | Outlier preservation + Q3_K base = optimal for small models | -| ❌ **Q4_HIFI fails on ≥4B models** | Sparse outliers can’t fix aggressive 4-bit base quantization | +| ❌ **Q4_K_HIFI fails on ≥4B models** | Sparse outliers can't fix aggressive 4-bit base quantization | | ✅ **Q4_K_M wins via Q6_K on key tensors** | Uniform higher precision > sparse outliers at scale | | ✅ **Early layers & embeddings matter most** | Precision should focus on `attn_v`, `ffn_gate`, `token_embd` | | ✅ **Domain-mixed imatrix is essential** | 60% Wikitext, 25% Code, 15% Math for balanced outlier selection | @@ -25,8 +25,8 @@ Geoff Munn​ | Format | Model Size | Strategy | Base Precision | Enhancement | |--------|------------|----------|----------------|-------------| | **Q3_HIFI** | **≤2B** | Outlier preservation | Q3_K | 8 FP16 outliers on early layers | -| **Q4_HIFI_M** | **3–10B** | Smart Q5_K allocation | Q4_K + Q5_K | Q5_K on sensitive tensors | -| **Q4_HIFI_L** | **>10B** | Q4_K_M + precision refinement | Q4_K + Q6_K | 6 FP16 outliers on Q6_K tensors | +| **Q4_K_HIFI_M** | **3–10B** | Smart Q5_K allocation | Q4_K + Q5_K | Q5_K on sensitive tensors | +| **Q4_K_HIFI_L** | **>10B** | Q4_K_M + precision refinement | Q4_K + Q6_K | 6 FP16 outliers on Q6_K tensors | --- @@ -53,7 +53,7 @@ static bool is_q3_hifi_tensor(const char* name, int layer_idx) { --- -## 🚀 **Phase 2: Q4_HIFI_M — Smart Q5_K Allocation (3–10B Models)** +## 🚀 **Phase 2: Q4_K_HIFI_M — Smart Q5_K Allocation (3–10B Models)** ### 🎯 **Objective**: Beat Q4_K_M by **replacing Q4_K with Q5_K on sensitive tensors**. 
@@ -81,7 +81,7 @@ static ggml_type get_q4_hifi_m_tensor_type(const char* tensor_name) { ``` ### 📊 **Expected Results (Qwen3-4B)** -| Metric | Q4_K_M | **Q4_HIFI_M** | +| Metric | Q4_K_M | **Q4_K_HIFI_M** | |--------|--------|---------------| | **PPL** | 14.79 | **14.55–14.65** ✅ | | **Speed** | 200 t/s | **196–198 t/s** ✅ | @@ -89,7 +89,7 @@ static ggml_type get_q4_hifi_m_tensor_type(const char* tensor_name) { --- -## 🚀 **Phase 3: Q4_HIFI_L — Q4_K_M + Strategic Outliers (>10B Models)** +## 🚀 **Phase 3: Q4_K_HIFI_L — Q4_K_M + Strategic Outliers (>10B Models)** ### 🎯 **Objective**: Squeeze extra quality from Q4_K_M on massive models. @@ -116,7 +116,7 @@ static ggml_type get_q4_hifi_l_tensor_type(const char* tensor_name) { ``` ### 📊 **Expected Results (Devstral-123B)** -| Metric | Q4_K_S | **Q4_HIFI_L** | +| Metric | Q4_K_S | **Q4_K_HIFI_L** | |--------|--------|---------------| | **PPL** | 11.24 | **11.10–11.15** ✅ | | **Speed** | 9.75 t/s | **9.65 t/s** ✅ | @@ -152,7 +152,7 @@ void quantize_hifi_family(...) { ./llama-quantize --hifi model-f16.gguf model-hifi.gguf # Manual override -./llama-quantize --quant-type Q4_HIFI_M model-f16.gguf model-hifi-m.gguf +./llama-quantize --quant-type Q4_K_HIFI_M model-f16.gguf model-hifi-m.gguf ``` ### **Step 3: Documentation** @@ -162,8 +162,8 @@ void quantize_hifi_family(...) { | Model Size | Command | Best For | |------------|---------|----------| | ≤2B | `--hifi` | Qwen-0.6B, Phi-3, Gemma-2B | -| 3–10B | `--quant-type Q4_HIFI_M` | Qwen-4B, Llama-3-8B, Mistral-7B | -| >10B | `--quant-type Q4_HIFI_L` | Distrill-123B, Llama-3-70B | +| 3–10B | `--quant-type Q4_K_HIFI_M` | Qwen-4B, Llama-3-8B, Mistral-7B | +| >10B | `--quant-type Q4_K_HIFI_L` | Distrill-123B, Llama-3-70B | ``` --- @@ -174,8 +174,8 @@ void quantize_hifi_family(...) 
{ |-------|-------------|-----|-------|------| | **Qwen3-0.6B** | **Q3_HIFI** | **23.42** | 593 t/s | 469 MiB | | **Qwen3-1.7B** | **Q3_HIFI** | **17.96** | 385 t/s | 1.22 GiB | -| **Qwen3-4B** | **Q4_HIFI_M** | **14.60** | 197 t/s | 2.36 GiB | -| **Devstral-123B** | **Q4_HIFI_L** | **11.12** | 9.65 t/s | 66.7 GiB | +| **Qwen3-4B** | **Q4_K_HIFI_M** | **14.60** | 197 t/s | 2.36 GiB | +| **Devstral-123B** | **Q4_K_HIFI_L** | **11.12** | 9.65 t/s | 66.7 GiB | --- @@ -184,7 +184,7 @@ void quantize_hifi_family(...) { 1. **No more forcing one format to scale** — each size gets its optimal strategy 2. **Builds on proven wins** — Q3_HIFI works, Q4_K_M works, now combine intelligently 3. **Minimal complexity** — no residual quantization, no INT8 experiments -4. **Clear user guidance** — “Use HIFI, we’ll pick the right variant” +4. **Clear user guidance** — "Use HIFI, we'll pick the right variant" --- @@ -193,13 +193,14 @@ void quantize_hifi_family(...) { | Phase | Task | Timeline | |-------|------|----------| | **1** | Q3_HIFI revival (reset + validate) | 3 days | -| **2** | Q4_HIFI_M implementation | 3 days | -| **3** | Q4_HIFI_L implementation | 4 days | +| **2** | Q4_K_HIFI_M implementation | 3 days | +| **3** | Q4_K_HIFI_L implementation | 4 days | | **4** | Unified CLI + documentation | 2 days | | **5** | Upstream PR preparation | 2 days | --- -This roadmap **honors your discoveries** while **avoiding known pitfalls**. You’re not starting over — you’re **focusing your proven strengths** where they matter most. +This roadmap **honors your discoveries** while **avoiding known pitfalls**. You're not starting over — you're **focusing your proven strengths** where they matter most. 
+ +**The HIFI family will be the first quantization approach that truly adapts to model scale — delivering optimal quality, speed, and size at every level.** -**The HIFI family will be the first quantization approach that truly adapts to model scale — delivering optimal quality, speed, and size at every level.** \ No newline at end of file diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index cf3649130be..9a033e87f13 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,8 @@ extern "C" { GGML_TYPE_Q6_K_HIFI = 41, // Q6_K_HIFI: Q6_K layout + 4 FP16 outliers for critical tensors GGML_TYPE_Q6_K_HIFI_DYNAMIC = 42, // Q6_K_HIFI_DYNAMIC: Q6_K + 2-8 outliers based on layer sensitivity GGML_TYPE_Q6_K_HIFI_RES8 = 43, // Q6_K_HIFI_RES8: Q6_K + INT8 residuals (compact format) - GGML_TYPE_COUNT = 44, + GGML_TYPE_Q5_K_HIFI_RES8 = 44, // Q5_K_HIFI_RES8: Q5_K + INT8 residuals (efficient for 4B-10B models) + GGML_TYPE_COUNT = 45, }; // precision diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index 14bcfe0e0ee..3d78cf9c0c1 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -415,6 +415,34 @@ typedef struct { // Total: 232 bytes (210 + 22) - saves 4 bytes/block vs Q6_K_HIFI_DYNAMIC static_assert(sizeof(block_q6_k_hifi_res8) == 232, "wrong q6_k_hifi_res8 block size/padding"); +// Q5_K_HIFI_RES8: Efficient Q5_K with INT8 residuals for 4B-10B models +// This format is optimized for mid-scale models where Q6_K overhead is wasteful. +// Q5_K base provides sufficient precision, outliers compensate for 1-bit loss. 
+// Size: 200 bytes vs Q6_K_HIFI_RES8's 232 bytes (~14% smaller) +// Expected results: matches Q6_K_HIFI_RES8 quality at better BPW efficiency +#define Q5_K_HIFI_RES8_MAX_OUTLIERS 8 +typedef struct { + // === Q5_K-COMPATIBLE REGION (176 bytes) - DO NOT REORDER === + GGML_EXTENSION union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // 12 bytes: scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // 32 bytes: quants, high bit + uint8_t qs[QK_K/2]; // 128 bytes: quants, low 4 bits + // === COMPACT INT8 RESIDUAL EXTENSION (24 bytes) === + uint8_t outlier_count; // 1 byte: actual outlier count (1-8) + uint8_t outlier_idx[Q5_K_HIFI_RES8_MAX_OUTLIERS]; // 8 bytes: outlier positions (0-255) + int8_t residual_vals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; // 8 bytes: INT8 residuals (-127 to +127) + uint8_t _padding[3]; // 3 bytes: padding for float alignment + float residual_scale; // 4 bytes: shared scale for residuals +} block_q5_k_hifi_res8; +// Total: 200 bytes (176 + 24) - 14% smaller than Q6_K_HIFI_RES8 +static_assert(sizeof(block_q5_k_hifi_res8) == 200, "wrong q5_k_hifi_res8 block size/padding"); + // This is only used for intermediate quantization and dot products typedef struct { float d; // delta diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 21ed1699e41..636410ac8d9 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -303,6 +303,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_Q5_K_HIFI_RES8] = { + .from_float = quantize_row_q5_k_hifi_res8, // 3-arg wrapper (matches Q6_K_HIFI_RES8 pattern) + .vec_dot = ggml_vec_dot_q5_k_hifi_res8_q8_K, // Efficient Q5_K + INT8 residuals kernel + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, 
[GGML_TYPE_Q4_K] = { .from_float = quantize_row_q4_K, .vec_dot = ggml_vec_dot_q4_K_q8_K, diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f03e743fc08..8cf01905477 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -676,6 +676,7 @@ void ggml_compute_forward_add( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -1129,6 +1130,7 @@ void ggml_compute_forward_add1( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -1261,6 +1263,7 @@ void ggml_compute_forward_acc( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -4288,6 +4291,7 @@ void ggml_compute_forward_out_prod( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -4567,6 +4571,7 @@ void ggml_compute_forward_set( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -4793,6 +4798,7 @@ void ggml_compute_forward_get_rows( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: @@ -5521,6 +5527,7 @@ void ggml_compute_forward_clamp( case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: diff --git a/ggml/src/ggml-cpu/quants.c 
// Q5_K_HIFI_RES8: Efficient Q5_K base + INT8 residuals for 4B-10B models
// Uses same correction strategy as Q6_K_HIFI_RES8, but with Q5_K base for better BPW
//
// Computes s = dot(x, y) for one row, where x is Q5_K_HIFI_RES8-quantized and
// y is Q8_K-quantized. The bulk of the work is a scalar re-implementation of
// ggml_vec_dot_q5_K_q8_K_generic over the Q5_K-compatible region of the block;
// afterwards the stored INT8 residuals are folded in at their outlier positions.
//
// Parameters follow the standard ggml vec_dot signature: n = element count
// (multiple of QK_K), s = output scalar, vx/vy = quantized row pointers;
// bs/bx/by/nrc are unused (nrc must be 1).
void ggml_vec_dot_q5_k_hifi_res8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
    assert(n % QK_K == 0);
    assert(nrc == 1);
    UNUSED(nrc);
    UNUSED(bx);
    UNUSED(by);
    UNUSED(bs);

    const block_q5_k_hifi_res8 * GGML_RESTRICT x = vx;
    const block_q8_K * GGML_RESTRICT y = vy;

    const int nb = n / QK_K;  // number of super-blocks in the row

    // Masks used to unpack the 6-bit scales/mins from the packed 12-byte field
    // (identical to the Q4_K/Q5_K scale unpacking).
    static const uint32_t kmask1 = 0x3f3f3f3f;
    static const uint32_t kmask2 = 0x0f0f0f0f;
    static const uint32_t kmask3 = 0x03030303;

    uint32_t utmp[4];
    const uint8_t * scales = (const uint8_t*)&utmp[0];  // 8 sub-block scales
    const uint8_t * mins = (const uint8_t*)&utmp[2];    // 8 sub-block mins

    int8_t aux8[QK_K];    // expanded 5-bit quants (0..31)
    int16_t aux16[8];     // per-lane products
    float sums [8];       // per-lane float accumulators across blocks
    int32_t aux32[8];     // per-lane int accumulators within a block
    memset(sums, 0, 8*sizeof(float));

    float sumf = 0;
    for (int i = 0; i < nb; ++i) {
        // === Q5_K bulk dot product (same as ggml_vec_dot_q5_K_q8_K_generic) ===
        const uint8_t * GGML_RESTRICT q4 = x[i].qs;   // low 4 bits of quants
        const uint8_t * GGML_RESTRICT hm = x[i].qh;   // high (5th) bit of quants
        const int8_t * GGML_RESTRICT q8 = y[i].qs;
        memset(aux32, 0, 8*sizeof(int32_t));
        int8_t * GGML_RESTRICT a = aux8;
        uint8_t m = 1;  // walking bit mask selecting the high bit for each 32-group
        for (int j = 0; j < QK_K; j += 64) {
            // Expand 64 weights per iteration: low nibbles then high nibbles,
            // each combined with its high bit (+16) from qh.
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF) + (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4) + (hm[l] & m ? 16 : 0);
            a += 32; m <<= 1;
            q4 += 32;
        }
        // Unpack the packed 6-bit scales/mins into utmp (scales -> utmp[0..1],
        // mins -> utmp[2..3]).
        memcpy(utmp, x[i].scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;

        // Accumulate the min-correction term using the precomputed q8 bsums.
        int sumi = 0;
        for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
        a = aux8;
        int is = 0;
        // 8 sub-blocks of 32 weights, each with its own 6-bit scale.
        for (int j = 0; j < QK_K/32; ++j) {
            int32_t scale = scales[is++];
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
            for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
            for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
            q8 += 8; a += 8;
        }
        const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
        for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
        const float dmin = GGML_CPU_FP16_TO_FP32(x[i].dmin) * y[i].d;
        sumf -= dmin * sumi;

        // === INT8 RESIDUAL CORRECTION ===
        // Add residual * activation corrections at the stored outlier positions.
        // Residuals are INT8 values scaled by residual_scale/127; the q8
        // activation scale y[i].d converts the quantized activation back to float.
        const int outlier_count = x[i].outlier_count;
        const float res_scale = x[i].residual_scale;
        const float d8 = y[i].d;
        const float scale_factor = res_scale * (1.0f / 127.0f) * d8;
        for (int k = 0; k < outlier_count; ++k) {
            const int idx = x[i].outlier_idx[k];
            const int8_t activation = y[i].qs[idx];
            // Skip near-zero activations (|q8| <= 4): their contribution is
            // negligible. NOTE(review): stated to be the same threshold as
            // Q6_K_HIFI — confirm against that kernel.
            if (activation > 4 || activation < -4) {
                const float residual = x[i].residual_vals[k] * scale_factor;
                sumf += residual * activation;
            }
        }
    }
    // Fold the per-lane accumulators into the scalar result.
    for (int l = 0; l < 8; ++l) sumf += sums[l];
    *s = sumf;
}
+void quantize_row_q5_k_hifi_res8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q5_k_hifi_res8_ref(x, (block_q5_k_hifi_res8 *)y, k); +} + void ggml_vec_dot_iq2_xxs_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index 0bd5b741cb9..76548c4caf6 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -30,6 +30,8 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q6_k_hifi(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q6_k_hifi_dynamic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q6_k_hifi_res8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_k_hifi_res8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +size_t quantize_q5_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -56,6 +58,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_k_hifi_dynamic_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q6_k_hifi_res8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t 
by, int nrc); +void ggml_vec_dot_q5_k_hifi_res8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index d2732f0d330..a8a492394ab 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -853,6 +853,13 @@ struct ggml_cuda_type_traits { static constexpr int qi = QI6_K; }; +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR5_K; + static constexpr int qi = QI5_K; +}; + template<> struct ggml_cuda_type_traits { static constexpr int qk = QK_K; diff --git a/ggml/src/ggml-cuda/convert.cu b/ggml/src/ggml-cuda/convert.cu index 9c15a411e44..4f17fed8c52 100644 --- a/ggml/src/ggml-cuda/convert.cu +++ b/ggml/src/ggml-cuda/convert.cu @@ -407,6 +407,56 @@ static __global__ void dequantize_block_q6_k_hifi_res8(const void * __restrict__ } } +// Q5_K_HIFI_RES8: Efficient Q5_K base with INT8 residuals for 4B-10B models +template +static __global__ void dequantize_block_q5_k_hifi_res8(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q5_k_hifi_res8 * x = (const block_q5_k_hifi_res8 *) vx; + + const int64_t i = blockIdx.x; + + // Q5_K bulk dequantization (same as dequantize_block_q5_K) + const int64_t tid = threadIdx.x; + const int64_t il = tid/16; // il is in 0...3 + const int64_t ir = tid%16; // ir is in 0...15 + const int64_t is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const 
uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; + + // Thread 0 handles INT8 residual corrections + __syncthreads(); + if (threadIdx.x == 0) { + dst_t * yb = yy + i*QK_K; + const int outlier_count = x[i].outlier_count; + const float res_scale = x[i].residual_scale; + const float scale_factor = res_scale * (1.0f / 127.0f); + // Add residual corrections at outlier positions + for (int k = 0; k < outlier_count && k < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k) { + const int idx = x[i].outlier_idx[k]; + const float residual = x[i].residual_vals[k] * scale_factor; + yb[idx] += residual; + } + } +} + template static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -743,6 +793,12 @@ static void dequantize_row_q6_k_hifi_res8_cuda(const void * vx, dst_t * y, const dequantize_block_q6_k_hifi_res8<<>>(vx, y); } +template +static void dequantize_row_q5_k_hifi_res8_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_q5_k_hifi_res8<<>>(vx, y); +} + template static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { const int nb = k / QK_K; @@ -876,6 +932,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_q6_k_hifi_dynamic_cuda; case GGML_TYPE_Q6_K_HIFI_RES8: return dequantize_row_q6_k_hifi_res8_cuda; + case GGML_TYPE_Q5_K_HIFI_RES8: + return 
dequantize_row_q5_k_hifi_res8_cuda; case GGML_TYPE_Q4_K: return dequantize_row_q4_K_cuda; case GGML_TYPE_Q5_K: @@ -935,6 +993,8 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_q6_k_hifi_dynamic_cuda; case GGML_TYPE_Q6_K_HIFI_RES8: return dequantize_row_q6_k_hifi_res8_cuda; + case GGML_TYPE_Q5_K_HIFI_RES8: + return dequantize_row_q5_k_hifi_res8_cuda; case GGML_TYPE_Q4_K: return dequantize_row_q4_K_cuda; case GGML_TYPE_Q5_K: diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index af00aee2ea7..06e1816f3fa 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -4386,6 +4386,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: case GGML_TYPE_Q6_K_HIFI_RES8: + case GGML_TYPE_Q5_K_HIFI_RES8: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 5a0d6c9e439..5dd8318604b 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -21,6 +21,7 @@ static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) case GGML_TYPE_Q6_K_HIFI: return vec_dot_q6_K_q8_1; // Reuse Q6_K kernel case GGML_TYPE_Q6_K_HIFI_DYNAMIC: return vec_dot_q6_K_q8_1; // Reuse Q6_K kernel case GGML_TYPE_Q6_K_HIFI_RES8: return vec_dot_q6_k_hifi_res8_q8_1; // HIFI kernel with residual corrections + case GGML_TYPE_Q5_K_HIFI_RES8: return vec_dot_q5_k_hifi_res8_q8_1; // HIFI kernel with residual corrections case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; @@ -51,6 +52,7 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) { case GGML_TYPE_Q6_K_HIFI: return VDR_Q6_K_Q8_1_MMVQ; // Same as Q6_K case GGML_TYPE_Q6_K_HIFI_DYNAMIC: return VDR_Q6_K_Q8_1_MMVQ; // Same as Q6_K case GGML_TYPE_Q6_K_HIFI_RES8: return 
// Q5_K_HIFI_RES8: Q5_K layout + INT8 residuals + per-block scale
// Efficient format for 4B-10B models with Q5_K base (176 bytes vs Q6_K's 210)
#define VDR_Q5_K_HIFI_RES8_Q8_1_MMVQ VDR_Q5_K_Q8_1_MMVQ

// Per-thread partial dot product of one Q5_K_HIFI_RES8 block against Q8_1
// activations, used by the mmvq path. The bulk term mirrors vec_dot_q5_K_q8_1;
// the residual term is added exactly once per block by the iqs == 0 lane.
static __device__ __forceinline__ float vec_dot_q5_k_hifi_res8_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) {

    const block_q5_k_hifi_res8 * bq5_hifi = (const block_q5_k_hifi_res8 *) vbq + kbx;

    // === Q5_K bulk dot product (same as vec_dot_q5_K_q8_1) ===
    int vl[2];        // low 4-bit quants, packed 4 per int
    int vh[2];        // high-bit planes, shifted into position
    int u[2*QR5_K];   // matching q8 activations
    float d8[QR5_K];  // q8 block scales

    // Select which q8_1 sub-blocks and which 32-weight slice this lane handles.
    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
    const int * ql = (const int *)(bq5_hifi->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
    const int * qh = (const int *)(bq5_hifi->qh + 4 * ((iqs/2)%4));

    vl[0] = ql[0];
    vl[1] = ql[4];

    vh[0] = qh[0] >> bq8_offset;
    vh[1] = qh[4] >> bq8_offset;

    // Unpack the two 6-bit scale/min pairs relevant to this slice from the
    // packed 12-byte scales field (same scheme as Q4_K/Q5_K).
    const uint16_t * scales = (const uint16_t *)bq5_hifi->scales;
    uint16_t aux[2];
    const int j = bq8_offset/2;
    if (j < 2) {
        aux[0] = scales[j+0] & 0x3f3f;
        aux[1] = scales[j+2] & 0x3f3f;
    } else {
        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
    }
    const uint8_t * sc = (const uint8_t *)aux;  // sub-block scales
    const uint8_t * m = sc + 2;                 // sub-block mins

#pragma unroll
    for (int i = 0; i < QR5_K; ++i) {
        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        d8[i] = __low2float(bq8i->ds);

        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
        u[2*i+0] = q8[0];
        u[2*i+1] = q8[4];
    }

    float sum = vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_hifi->dm, d8);

    // === INT8 RESIDUAL CORRECTION ===
    const int outlier_count = bq5_hifi->outlier_count;

    if (outlier_count > 0) {
        // Shared residual scale; residual_vals are INT8 in [-127, 127].
        const float res_scale = bq5_hifi->residual_scale * (1.0f / 127.0f);

        // Only the iqs == 0 lane adds the residual term so each block
        // contributes it exactly once before the caller's reduction.
        // NOTE(review): assumes the mmvq caller invokes every iqs for each
        // block — confirm against the Q6_K_HIFI_RES8 kernel's convention.
        if (iqs == 0) {
            for (int k = 0; k < outlier_count && k < 8; ++k) {
                const int idx = bq5_hifi->outlier_idx[k];
                const int idx_bq8 = idx / QK8_1;     // which q8_1 sub-block
                const int idx_in_bq8 = idx % QK8_1;  // position inside it

                const int8_t q8_val = ((const int8_t*)bq8_1[idx_bq8].qs)[idx_in_bq8];
                const float d8_val = __low2float(bq8_1[idx_bq8].ds);
                const float residual = res_scale * bq5_hifi->residual_vals[k];
                sum += residual * q8_val * d8_val;
            }
        }
    }

    return sum;
}
// Strategy 1: score one quantization block by how "outlier-prone" its imatrix
// weights look. The score blends the coefficient of variation (relative
// spread) with a max/mean "spikiness" ratio and is mapped into [0.2, 0.9].
// Returns neutral defaults when no usable imatrix data is available:
// 0.5 for missing data, 0.3 for an (almost) all-zero block.
float ggml_hifi_compute_block_importance(
    const float * imatrix_block,
    int block_size
) {
    // No imatrix data for this block: report medium importance.
    if (imatrix_block == NULL || block_size <= 0) {
        return 0.5f;
    }

    // Single pass: accumulate sum, sum of squares, and the peak value.
    double total    = 0.0;
    double total_sq = 0.0;
    double peak     = 0.0;
    for (int i = 0; i < block_size; ++i) {
        const double w = (double)imatrix_block[i];
        total    += w;
        total_sq += w * w;
        if (w > peak) {
            peak = w;
        }
    }

    const double mean = total / (double)block_size;
    if (mean < 1e-10) {
        return 0.3f;  // near-zero weights: low importance
    }

    const double mean_sq = total_sq / (double)block_size;
    double variance = mean_sq - mean * mean;
    if (variance < 0) {
        variance = 0;  // guard against tiny negative values from rounding
    }

    const double cv        = sqrt(variance) / mean;  // relative spread
    const double spikiness = peak / mean;            // extreme-value ratio, typically 1-20

    // High CV => some weights are outliers; high spikiness => extreme values.
    // Either indicates the block benefits from more residual corrections.
    const double combined = 0.6 * cv + 0.4 * (spikiness / 10.0);

    // Map combined (typically 0-3) into the [0.2, 0.9] importance band.
    float importance = 0.2f + 0.7f * (float)(combined / 2.0);
    if (importance > 0.9f) importance = 0.9f;
    if (importance < 0.2f) importance = 0.2f;

    return importance;
}
or down based on block importance +int ggml_hifi_compute_block_outlier_count( + float block_importance, + int base_outlier_count, + float model_params_b +) { + // Scale factor based on block importance + // High importance (>0.7): boost outliers up to 1.5x + // Low importance (<0.3): reduce outliers down to 0.5x + // Medium importance: keep base count + float scale = 1.0f; + + if (block_importance > 0.7f) { + // High importance block - boost outliers + scale = 1.0f + 0.5f * (block_importance - 0.7f) / 0.3f; // 1.0 to 1.5 + } else if (block_importance < 0.3f) { + // Low importance block - reduce outliers + scale = 0.5f + 0.5f * (block_importance / 0.3f); // 0.5 to 1.0 + } + + // For larger models, be more aggressive with reduction on low-importance blocks + if (model_params_b >= 7.0f && block_importance < 0.4f) { + scale *= 0.8f; // Additional 20% reduction for large models + } + + int adjusted_count = (int)roundf((float)base_outlier_count * scale); + + // Clamp to valid range [1, 8] + // Allow minimum of 1 for low-importance blocks (save more space) + if (adjusted_count < 1) adjusted_count = 1; + if (adjusted_count > 8) adjusted_count = 8; + + return adjusted_count; +} + diff --git a/ggml/src/ggml-quants-hifi.h b/ggml/src/ggml-quants-hifi.h index 919bbcca728..89a0b8ba823 100644 --- a/ggml/src/ggml-quants-hifi.h +++ b/ggml/src/ggml-quants-hifi.h @@ -1,5 +1,5 @@ // GGML HIFI Quantization Context -// Provides layer-adaptive outlier allocation for Q4_HIFI quantization +// Provides layer-adaptive outlier allocation for Q4_K_HIFI quantization // // This header defines the context infrastructure for passing layer-specific // parameters to the quantization functions without modifying the core GGML API. 
@@ -20,6 +20,12 @@ extern "C" { #define Q6_K_HIFI_RES8_MAX_OUTLIERS 8 #endif +// Maximum outliers per block for Q5_K_HIFI_RES8 format +// Must match the value in ggml-common.h +#ifndef Q5_K_HIFI_RES8_MAX_OUTLIERS +#define Q5_K_HIFI_RES8_MAX_OUTLIERS 8 +#endif + // Layer-adaptive quantization context // Used to pass dynamic parameters to Q6_K_HIFI_RES8 quantization typedef struct { @@ -63,6 +69,30 @@ GGML_API float ggml_hifi_compute_tensor_importance( int64_t n_elements ); +// Strategy 1: Compute per-block importance from imatrix data +// Used for adaptive per-block outlier allocation +// Parameters: +// imatrix_block: Per-element importance weights for this block (QK_K elements) +// block_size: Number of elements in the block (typically QK_K = 256) +// Returns: Block importance score (0.0-1.0) +GGML_API float ggml_hifi_compute_block_importance( + const float * imatrix_block, + int block_size +); + +// Strategy 1: Compute per-block outlier count based on local imatrix variance +// High variance blocks get more outliers, low variance blocks get fewer +// Parameters: +// block_importance: Importance score for this block (0.0-1.0) +// base_outlier_count: Base outlier count from tensor-level computation +// model_params_b: Model size in billions +// Returns: Adjusted outlier count for this block (2-8) +GGML_API int ggml_hifi_compute_block_outlier_count( + float block_importance, + int base_outlier_count, + float model_params_b +); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 4eec5c6a6e7..43cf95f7b60 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -2441,19 +2441,35 @@ void quantize_row_q6_k_hifi_res8_ref(const float * GGML_RESTRICT x, block_q6_k_h quantize_row_q6_k_hifi_res8_ref_ex(x, y, k, Q6_K_HIFI_RES8_MAX_OUTLIERS); } -// imatrix-aware quantization implementation -static void quantize_row_q6_k_hifi_res8_impl(const float * GGML_RESTRICT x, block_q6_k_hifi_res8 * GGML_RESTRICT y, int64_t k, const 
float * GGML_RESTRICT quant_weights, int outlier_count) { +// imatrix-aware quantization implementation with per-block adaptive outliers (Strategy 1) +static void quantize_row_q6_k_hifi_res8_impl(const float * GGML_RESTRICT x, block_q6_k_hifi_res8 * GGML_RESTRICT y, int64_t k, const float * GGML_RESTRICT quant_weights, int base_outlier_count) { assert(k % QK_K == 0); const int64_t nb = k / QK_K; - if (outlier_count < 1) outlier_count = 1; - if (outlier_count > Q6_K_HIFI_RES8_MAX_OUTLIERS) outlier_count = Q6_K_HIFI_RES8_MAX_OUTLIERS; + if (base_outlier_count < 1) base_outlier_count = 1; + if (base_outlier_count > Q6_K_HIFI_RES8_MAX_OUTLIERS) base_outlier_count = Q6_K_HIFI_RES8_MAX_OUTLIERS; + + // Get model size from HIFI context for per-block adaptation + float model_params_b = 1.0f; // Default to 1B for Q6_K (small models) + const ggml_hifi_quant_context * hifi_ctx = ggml_hifi_get_context(); + if (hifi_ctx && hifi_ctx->is_active) { + model_params_b = hifi_ctx->model_params_b; + } for (int64_t ib = 0; ib < nb; ++ib) { const float * xb = x + ib * QK_K; const float * qw = quant_weights ? 
quant_weights + ib * QK_K : NULL; block_q6_k_hifi_res8 * block = &y[ib]; + // Strategy 1: Compute per-block adaptive outlier count based on local imatrix variance + int outlier_count = base_outlier_count; + if (qw != NULL) { + // Compute block importance from local imatrix data + float block_importance = ggml_hifi_compute_block_importance(qw, QK_K); + // Adjust outlier count based on block importance + outlier_count = ggml_hifi_compute_block_outlier_count(block_importance, base_outlier_count, model_params_b); + } + block->outlier_count = (uint8_t)outlier_count; block->_padding = 0; @@ -2577,6 +2593,240 @@ size_t quantize_q6_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_REST return nrow * row_size; } +// ===================================================================== +// Q5_K_HIFI_RES8: Efficient Q5_K with INT8 residuals for 4B-10B models +// Uses Q5_K base (176 bytes) instead of Q6_K (210 bytes) for better BPW +// ===================================================================== + +// Extended quantization function with explicit outlier count +void quantize_row_q5_k_hifi_res8_ref_ex(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k, int outlier_count) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + // Clamp outlier count to valid range + if (outlier_count < 1) outlier_count = 1; + if (outlier_count > Q5_K_HIFI_RES8_MAX_OUTLIERS) outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib * QK_K; + block_q5_k_hifi_res8 * block = &y[ib]; + + // Initialize extension fields + block->outlier_count = (uint8_t)outlier_count; + memset(block->_padding, 0, sizeof(block->_padding)); + + // Step 1: Find top-k outliers by magnitude + float mag[QK_K]; + for (int i = 0; i < QK_K; ++i) { + mag[i] = fabsf(xb[i]); + } + + // Simple selection sort for top-k (k <= 8, so O(n*k) is fine) + int outlier_indices[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < 
outlier_count; ++k_idx) { + int max_idx = 0; + float max_val = mag[0]; + for (int i = 1; i < QK_K; ++i) { + if (mag[i] > max_val) { + max_val = mag[i]; + max_idx = i; + } + } + outlier_indices[k_idx] = max_idx; + mag[max_idx] = -1.0f; // Mark as used + } + + // Step 2: Zero outliers temporarily and quantize as Q5_K + float tmp[QK_K]; + memcpy(tmp, xb, QK_K * sizeof(float)); + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + tmp[outlier_indices[k_idx]] = 0.0f; + } + + // Quantize the Q5_K base (this fills dm, scales, qh, qs) + quantize_row_q5_K_ref(tmp, (block_q5_K *)block, QK_K); + + // Step 3: Compute residuals from Q5_K reconstruction + float dequant[QK_K]; + dequantize_row_q5_K((const block_q5_K *)block, dequant, QK_K); + + float max_residual = 0.0f; + float residuals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + const int idx = outlier_indices[k_idx]; + residuals[k_idx] = xb[idx] - dequant[idx]; + if (fabsf(residuals[k_idx]) > max_residual) { + max_residual = fabsf(residuals[k_idx]); + } + } + + // Handle zero case + if (max_residual == 0.0f) max_residual = 1e-8f; + block->residual_scale = max_residual; + + // Step 4: Store indices and INT8-quantized residuals + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + block->outlier_idx[k_idx] = (uint8_t)outlier_indices[k_idx]; + float norm_res = residuals[k_idx] / max_residual; + block->residual_vals[k_idx] = (int8_t)roundf(norm_res * 127.0f); + } + // Zero-fill remaining slots + for (int k_idx = outlier_count; k_idx < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k_idx) { + block->outlier_idx[k_idx] = 0; + block->residual_vals[k_idx] = 0; + } + } +} + +// 3-argument wrapper for ggml_from_float_t compatibility +void quantize_row_q5_k_hifi_res8_ref(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k) { + quantize_row_q5_k_hifi_res8_ref_ex(x, y, k, Q5_K_HIFI_RES8_MAX_OUTLIERS); +} + +// imatrix-aware quantization implementation with per-block adaptive 
outliers (Strategy 1) +static void quantize_row_q5_k_hifi_res8_impl(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k, const float * GGML_RESTRICT quant_weights, int base_outlier_count) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + if (base_outlier_count < 1) base_outlier_count = 1; + if (base_outlier_count > Q5_K_HIFI_RES8_MAX_OUTLIERS) base_outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + + // Get model size from HIFI context for per-block adaptation + float model_params_b = 4.0f; // Default to 4B if no context + const ggml_hifi_quant_context * hifi_ctx = ggml_hifi_get_context(); + if (hifi_ctx && hifi_ctx->is_active) { + model_params_b = hifi_ctx->model_params_b; + } + + for (int64_t ib = 0; ib < nb; ++ib) { + const float * xb = x + ib * QK_K; + const float * qw = quant_weights ? quant_weights + ib * QK_K : NULL; + block_q5_k_hifi_res8 * block = &y[ib]; + + // Strategy 1: Compute per-block adaptive outlier count based on local imatrix variance + int outlier_count = base_outlier_count; + if (qw != NULL) { + // Compute block importance from local imatrix data + float block_importance = ggml_hifi_compute_block_importance(qw, QK_K); + // Adjust outlier count based on block importance + outlier_count = ggml_hifi_compute_block_outlier_count(block_importance, base_outlier_count, model_params_b); + } + + block->outlier_count = (uint8_t)outlier_count; + memset(block->_padding, 0, sizeof(block->_padding)); + + // Find top-k outliers using imatrix-weighted importance + float importance[QK_K]; + for (int i = 0; i < QK_K; ++i) { + float weight = qw ? 
qw[i] : 1.0f; + importance[i] = fabsf(xb[i]) * weight; + } + + int outlier_indices[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + int max_idx = 0; + float max_val = importance[0]; + for (int i = 1; i < QK_K; ++i) { + if (importance[i] > max_val) { + max_val = importance[i]; + max_idx = i; + } + } + outlier_indices[k_idx] = max_idx; + importance[max_idx] = -1.0f; + } + + // Zero outliers and quantize Q5_K base + float tmp[QK_K]; + memcpy(tmp, xb, QK_K * sizeof(float)); + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + tmp[outlier_indices[k_idx]] = 0.0f; + } + quantize_row_q5_K_ref(tmp, (block_q5_K *)block, QK_K); + + // Compute residuals + float dequant[QK_K]; + dequantize_row_q5_K((const block_q5_K *)block, dequant, QK_K); + + float max_residual = 0.0f; + float residuals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + const int idx = outlier_indices[k_idx]; + residuals[k_idx] = xb[idx] - dequant[idx]; + if (fabsf(residuals[k_idx]) > max_residual) { + max_residual = fabsf(residuals[k_idx]); + } + } + + if (max_residual == 0.0f) max_residual = 1e-8f; + block->residual_scale = max_residual; + + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + block->outlier_idx[k_idx] = (uint8_t)outlier_indices[k_idx]; + float norm_res = residuals[k_idx] / max_residual; + block->residual_vals[k_idx] = (int8_t)roundf(norm_res * 127.0f); + } + for (int k_idx = outlier_count; k_idx < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k_idx) { + block->outlier_idx[k_idx] = 0; + block->residual_vals[k_idx] = 0; + } + } +} + +// Dequantization: Q5_K base + INT8 residual corrections +void dequantize_row_q5_k_hifi_res8(const block_q5_k_hifi_res8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int64_t ib = 0; ib < nb; ++ib) { + const block_q5_k_hifi_res8 * block = &x[ib]; + float * yb = y + ib * QK_K; + + // Dequantize Q5_K base + 
dequantize_row_q5_K((const block_q5_K *)block, yb, QK_K); + + // Add residual corrections at outlier positions + const int outlier_count = block->outlier_count; + const float scale = block->residual_scale; + for (int k_idx = 0; k_idx < outlier_count; ++k_idx) { + const int idx = block->outlier_idx[k_idx]; + const float residual = scale * (block->residual_vals[k_idx] / 127.0f); + yb[idx] += residual; + } + } +} + +// Public quantization function with imatrix support +size_t quantize_q5_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + size_t row_size = ggml_row_size(GGML_TYPE_Q5_K_HIFI_RES8, n_per_row); + + // Get adaptive outlier count from HIFI context if available + int outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + const ggml_hifi_quant_context * hifi_ctx = ggml_hifi_get_context(); + if (hifi_ctx && hifi_ctx->is_active) { + outlier_count = hifi_ctx->outlier_count; + if (outlier_count < 1) outlier_count = 1; + if (outlier_count > Q5_K_HIFI_RES8_MAX_OUTLIERS) outlier_count = Q5_K_HIFI_RES8_MAX_OUTLIERS; + } + + if (!quant_weights) { + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q5_k_hifi_res8_ref_ex(src, (block_q5_k_hifi_res8*)qrow, n_per_row, outlier_count); + src += n_per_row; + qrow += row_size; + } + } else { + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q5_k_hifi_res8_impl(src, (block_q5_k_hifi_res8*)qrow, n_per_row, quant_weights, outlier_count); + src += n_per_row; + qrow += row_size; + } + } + return nrow * row_size; +} + static void quantize_row_q4_0_impl(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_0 == 32, "QK4_0 must be 32"); @@ -6019,6 +6269,11 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_k_hifi_res8, data, nb); } break; + 
case GGML_TYPE_Q5_K_HIFI_RES8: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_k_hifi_res8, data, nb); + } break; + case GGML_TYPE_I8: case GGML_TYPE_I16: case GGML_TYPE_I32: diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 5eeea860fcf..bb573278ce3 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -123,6 +123,13 @@ GGML_API void quantize_row_q6_k_hifi_res8_ref_ex(const float * GGML_RESTRICT x, GGML_API void dequantize_row_q6_k_hifi_res8(const block_q6_k_hifi_res8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API size_t quantize_q6_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +// Q5_K_HIFI_RES8: Efficient Q5_K with INT8 residuals for 4B-10B models +// Uses Q5_K base (176 bytes) instead of Q6_K (210 bytes) for better BPW efficiency +GGML_API void quantize_row_q5_k_hifi_res8_ref(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q5_k_hifi_res8_ref_ex(const float * GGML_RESTRICT x, block_q5_k_hifi_res8 * GGML_RESTRICT y, int64_t k, int outlier_count); +GGML_API void dequantize_row_q5_k_hifi_res8(const block_q5_k_hifi_res8 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API size_t quantize_q5_k_hifi_res8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index e4ffc321c2b..b02b4ee6c4d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -764,6 +764,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q6_k_hifi_res8, .from_float_ref = (ggml_from_float_t) quantize_row_q6_k_hifi_res8_ref, }, + [GGML_TYPE_Q5_K_HIFI_RES8] = { + .type_name = "Q5_K_HIFI_RES8", + .blck_size = QK_K, + .type_size = sizeof(block_q5_k_hifi_res8), + .is_quantized = true, + .to_float = 
(ggml_to_float_t) dequantize_row_q5_k_hifi_res8, + .from_float_ref = (ggml_from_float_t) quantize_row_q5_k_hifi_res8_ref, + }, [GGML_TYPE_Q4_K] = { .type_name = "q4_K", .blck_size = QK_K, @@ -7573,6 +7581,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_Q6_K_HIFI: result = quantize_q6_k_hifi(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K_HIFI_DYNAMIC: result = quantize_q6_k_hifi_dynamic(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q6_K_HIFI_RES8: result = quantize_q6_k_hifi_res8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q5_K_HIFI_RES8: result = quantize_q5_k_hifi_res8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_F16: { size_t elemsize = sizeof(ggml_fp16_t); diff --git a/include/llama.h b/include/llama.h index 32f4a002b88..aed19226442 100644 --- a/include/llama.h +++ b/include/llama.h @@ -152,8 +152,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors - // Legacy HIFI types (39-43) removed - consolidated into Q4_HIFI (44) - LLAMA_FTYPE_MOSTLY_Q4_HIFI = 44, // Q4_K_M + 2-8 dynamic outliers + early exit (best quality/size ratio) + // Legacy HIFI types (39-43) removed - consolidated into Q4_K_HIFI (44) + LLAMA_FTYPE_MOSTLY_Q4_K_HIFI = 44, // Q4_K_M + 2-8 dynamic outliers + early exit (best quality/size ratio) LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ed87421bfd9..bec7617441a 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -60,7 +60,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return 
"IQ3_S - 3.4375 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; - case LLAMA_FTYPE_MOSTLY_Q4_HIFI: return "Q4_HIFI - ~4.95 bpw (Q4_K_M + INT8 residuals, compact)"; + case LLAMA_FTYPE_MOSTLY_Q4_K_HIFI: return "Q4_K_HIFI - ~4.95 bpw (Q4_K_M + INT8 residuals, compact)"; default: return "unknown, may not work"; } @@ -663,8 +663,9 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; - case GGML_TYPE_Q6_K_HIFI_DYNAMIC: ftype = LLAMA_FTYPE_MOSTLY_Q4_HIFI; break; - case GGML_TYPE_Q6_K_HIFI_RES8: ftype = LLAMA_FTYPE_MOSTLY_Q4_HIFI; break; + case GGML_TYPE_Q6_K_HIFI_DYNAMIC: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_HIFI; break; + case GGML_TYPE_Q6_K_HIFI_RES8: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_HIFI; break; + case GGML_TYPE_Q5_K_HIFI_RES8: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_HIFI; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 79e699576dc..d3dc75e6599 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -48,18 +48,42 @@ static float compute_model_params_b(const llama_hparams & hparams, int64_t n_voc return (float)(attn_params + ffn_params + emb_params) / 1e9f; } +// Get the appropriate HIFI type based on model size +// Q5_K_HIFI_RES8 is more efficient for 4B-10B models (176-byte base vs 210-byte) +// Q6_K_HIFI_RES8 is better for small models where every bit counts +static ggml_type get_hifi_enhanced_type(float model_params_b) { + if (model_params_b <= 2.0f) { + // Small models (≤2B): Q6_K base for maximum quality + return GGML_TYPE_Q6_K_HIFI_RES8; + } else if (model_params_b <= 12.0f) { + // Medium models (4B-10B): Q5_K base for better BPW efficiency + // Q5_K + outliers ≈ Q6_K quality, but 15% smaller + return GGML_TYPE_Q5_K_HIFI_RES8; + } else { + // Large models (>12B): 
Q5_K for efficiency (diminishing returns from Q6_K) + return GGML_TYPE_Q5_K_HIFI_RES8; + } +} + // Get the percentage of attn_v layers to enhance based on model size // Smaller models benefit more from enhancement, larger models have diminishing returns +// Strategy 3: For very large models (>10B), skip attn_v enhancement entirely +// Only token_embd and output.weight are enhanced (handled separately) static float get_hifi_enhancement_threshold(float model_params_b) { if (model_params_b <= 2.0f) { // Small models (≤2B): enhance 50% of layers - high ROI return 0.50f; - } else if (model_params_b <= 8.0f) { - // Medium models (2-8B): enhance 30% of layers - moderate ROI + } else if (model_params_b <= 5.0f) { + // Medium-small models (2-5B): enhance 30% of layers - moderate ROI return 0.30f; + } else if (model_params_b <= 10.0f) { + // Medium-large models (5-10B): enhance 20% of layers - lower ROI + return 0.20f; } else { - // Large models (>8B): enhance only 15% of layers - diminishing returns - return 0.15f; + // Very large models (>10B): Skip ALL attn_v enhancement + // Only token_embd and output.weight are enhanced (reduces overhead significantly) + // Research shows attn_v enhancement provides <0.05% PPL improvement at >10B + return 0.0f; } } @@ -260,9 +284,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { new_type = GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - // Q4_HIFI: Q6_K_HIFI_RES8 (Q6_K + INT8 residuals) on output - always critical - new_type = GGML_TYPE_Q6_K_HIFI_RES8; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + // Q4_K_HIFI: Use size-aware HIFI type on output - always critical + // Q5_K_HIFI_RES8 for 4B-10B, Q6_K_HIFI_RES8 for smaller models + const float model_params_b = compute_model_params_b(qs.model.hparams, qs.model.vocab.n_tokens()); + new_type = get_hifi_enhanced_type(model_params_b); } else if (new_type != GGML_TYPE_Q8_0) { new_type = 
GGML_TYPE_Q6_K; @@ -293,9 +319,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - // Q4_HIFI: Q6_K_HIFI_RES8 (Q6_K + INT8 residuals) on token embeddings - always critical - new_type = GGML_TYPE_Q6_K_HIFI_RES8; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + // Q4_K_HIFI: Use size-aware HIFI type on token embeddings - always critical + // Q5_K_HIFI_RES8 for 4B-10B, Q6_K_HIFI_RES8 for smaller models + const float model_params_b = compute_model_params_b(qs.model.hparams, qs.model.vocab.n_tokens()); + new_type = get_hifi_enhanced_type(model_params_b); } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || @@ -341,18 +369,19 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? 
GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) {
-            // Q4_HIFI: Model-size-aware enhancement to optimize size vs quality tradeoff
-            // - Small models (≤2B): enhance 50% of attn_v layers (high ROI)
-            // - Medium models (2-8B): enhance 30% of attn_v layers (moderate ROI)
-            // - Large models (>8B): enhance 15% of attn_v layers (diminishing returns)
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) {
+            // Q4_K_HIFI: Model-size-aware enhancement to optimize size vs quality tradeoff
+            // - Small models (≤2B): Q6_K_HIFI_RES8, enhance 50% of attn_v layers (high ROI)
+            // - Medium models: Q5_K_HIFI_RES8, enhance 30% of layers at 2-5B, 20% at 5-10B (optimal BPW)
+            // - Very large models (>10B): threshold is 0% — attn_v enhancement is skipped entirely;
+            //   only token_embd/output are enhanced (see get_hifi_enhancement_threshold)
             // This reduces enhanced tensor count significantly for large models while
             // preserving quality where it matters (early layers + embeddings)
             const float model_params_b = compute_model_params_b(qs.model.hparams, qs.model.vocab.n_tokens());
             const float enhancement_threshold = get_hifi_enhancement_threshold(model_params_b);
+            const ggml_type hifi_type = get_hifi_enhanced_type(model_params_b);
             if (qs.i_attention_wv <= qs.n_attention_wv * enhancement_threshold) {
-                new_type = GGML_TYPE_Q6_K_HIFI_RES8;
+                new_type = hifi_type;  // Use size-appropriate HIFI type
             } else if (use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) {
                 new_type = GGML_TYPE_Q6_K;  // Follow Q4_K_M behavior for critical late layers
             }
@@ -418,8 +447,8 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
     else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
         new_type = arch == LLM_ARCH_FALCON ? 
GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - // Q4_HIFI follows Q4_K_M behavior for ffn_down + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + // Q4_K_HIFI follows Q4_K_M behavior for ffn_down if (arch == LLM_ARCH_FALCON) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K : use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; @@ -466,7 +495,7 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate") != std::string::npos) { @@ -652,7 +681,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_M: default_type = GGML_TYPE_IQ3_S; break; - case LLAMA_FTYPE_MOSTLY_Q4_HIFI: default_type = GGML_TYPE_Q4_K; break; // Q4_K_M + dynamic outliers + early exit + case LLAMA_FTYPE_MOSTLY_Q4_K_HIFI: default_type = GGML_TYPE_Q4_K; break; // Q4_K_M + dynamic outliers + early exit default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -723,8 +752,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: gguf_set_val_u32(ctx_out.get(), "general.file_type", ftype); // TODO: use LLM_KV // Set quantization type string for Hugging Face model card display - if (ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { - 
gguf_set_val_str(ctx_out.get(), "general.quantization_type", "Q4_HIFI"); + if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { + gguf_set_val_str(ctx_out.get(), "general.quantization_type", "Q4_K_HIFI"); } // Remove split metadata @@ -1070,7 +1099,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: ggml_hifi_quant_context hifi_ctx = {}; const ggml_hifi_quant_context * hifi_ctx_ptr = nullptr; - if (new_type == GGML_TYPE_Q6_K_HIFI_RES8 && ftype == LLAMA_FTYPE_MOSTLY_Q4_HIFI) { + // Handle both Q6_K_HIFI_RES8 and Q5_K_HIFI_RES8 HIFI types + const bool is_hifi_type = (new_type == GGML_TYPE_Q6_K_HIFI_RES8 || new_type == GGML_TYPE_Q5_K_HIFI_RES8); + if (is_hifi_type && ftype == LLAMA_FTYPE_MOSTLY_Q4_K_HIFI) { // Extract layer index from tensor name (e.g., "blk.5.attn_v.weight" -> 5) int layer_idx = -1; if (sscanf(name.c_str(), "blk.%d.", &layer_idx) != 1) { @@ -1105,14 +1136,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } // Compute adaptive outlier count + // Use the appropriate max outliers constant based on type + const int max_outliers = (new_type == GGML_TYPE_Q5_K_HIFI_RES8) + ? 
Q5_K_HIFI_RES8_MAX_OUTLIERS : Q6_K_HIFI_RES8_MAX_OUTLIERS; int outlier_count; if (layer_idx < 0) { // Critical non-layer tensors (token_embd, output.weight): max outliers - outlier_count = Q6_K_HIFI_RES8_MAX_OUTLIERS; + outlier_count = max_outliers; } else { outlier_count = ggml_hifi_compute_outlier_count( layer_idx, n_layers, layer_importance, model_params_b ); + // Clamp to the type's max outliers + if (outlier_count > max_outliers) outlier_count = max_outliers; } // Set up context @@ -1125,8 +1161,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: hifi_ctx_ptr = &hifi_ctx; // Log adaptive outlier allocation (INFO level for visibility) - LLAMA_LOG_INFO("(HIFI: model=%.1fB layer=%d/%d imp=%.2f outliers=%d) ", - model_params_b, layer_idx, n_layers, layer_importance, outlier_count); + const char * type_name = (new_type == GGML_TYPE_Q5_K_HIFI_RES8) ? "Q5_K_HIFI" : "Q6_K_HIFI"; + LLAMA_LOG_INFO("(%s: model=%.1fB layer=%d/%d imp=%.2f outliers=%d) ", + type_name, model_params_b, layer_idx, n_layers, layer_importance, outlier_count); } for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 034cc2f41a4..f4d775c070c 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -43,7 +43,7 @@ static const std::vector QUANT_OPTIONS = { { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", }, - { "Q4_HIFI", LLAMA_FTYPE_MOSTLY_Q4_HIFI, " ~4.95 bpw Q4_K_M + INT8 residuals (best quality-per-byte)", }, + { "Q4_K_HIFI", LLAMA_FTYPE_MOSTLY_Q4_K_HIFI, " ~4.95 bpw Q4_K_M + INT8 residuals (best quality-per-byte)", }, { "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", }, { "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", }, { "Q4_K", 
LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },