From 904640a88d24bc9c60c5b0e4fc65aeda250e38e8 Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Fri, 27 Feb 2026 15:13:26 +1300 Subject: [PATCH 1/8] First cut at speed increases --- ggml/src/ggml-cuda/mmq.cu | 67 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 23b852e8f19..abcdee3d20d 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,6 +3,48 @@ #include "quantize.cuh" #include "mmid.cuh" +// Copy Q5_K base (176 bytes) from each Q5_K_HIFI_RES8 block (196 bytes) for MMQ path +static __global__ void ggml_cuda_compact_q5_k_hifi_res8_to_q5_k( + const void * __restrict__ src, void * __restrict__ dst, int64_t n_blocks) { + const int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n_blocks) return; + const char * s = (const char *)src + i * sizeof(block_q5_k_hifi_res8); + char * d = (char *)dst + i * sizeof(block_q5_K); + for (int j = 0; j < (int)sizeof(block_q5_K); ++j) { + d[j] = s[j]; + } +} + +// Add Q5_K_HIFI_RES8 INT8 residual corrections to MMQ output using F32 activations +static __global__ void ggml_cuda_add_q5_k_hifi_res8_residuals( + const block_q5_k_hifi_res8 * __restrict__ x, + const float * __restrict__ src1, float * __restrict__ dst, + int64_t nrows_x, int64_t ncols_x, int64_t ncols_dst, + int64_t stride_row_x, int64_t stride_src1, int64_t stride_dst) { + const int64_t linear = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (linear >= nrows_x * ncols_dst) return; + const int64_t row = linear / ncols_dst; + const int64_t batch = linear % ncols_dst; + const int64_t n_blocks = ncols_x / QK_K; + float sum = 0.0f; + for (int64_t b = 0; b < n_blocks; ++b) { + const block_q5_k_hifi_res8 * block = x + row * stride_row_x + b; + const int n_out = (block->outlier_count & 0x7F); + if (n_out == 0) continue; + const uint8_t e4m3 = block->residual_scale_e4m3; + if (e4m3 == 0) continue; + const int sign = (e4m3 >> 7) & 0x01; + 
const int exp = (e4m3 >> 3) & 0x0F; + const int mantissa = e4m3 & 0x07; + const float res_scale = (1.0f + (float)mantissa * 0.125f) * exp2f((float)exp - 7.0f) * (sign ? -1.0f : 1.0f) * (1.0f / 127.0f); + for (int k = 0; k < n_out && k < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k) { + const int col = b * QK_K + block->outlier_idx[k]; + sum += res_scale * (float)block->residual_vals[k] * src1[batch * stride_src1 + col]; + } + } + dst[batch * stride_dst + row] += sum; +} + static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { case GGML_TYPE_Q4_0: @@ -147,6 +189,30 @@ void ggml_cuda_mul_mat_q( ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int)); const int64_t s13 = ne12*s12; + if (src0->type == GGML_TYPE_Q5_K_HIFI_RES8) { + const int64_t n_blocks = (ne00 / QK_K) * ne01; + ggml_cuda_pool_alloc q5_k_compact(ctx.pool(), n_blocks * sizeof(block_q5_K)); + const int nth = 256; + ggml_cuda_compact_q5_k_hifi_res8_to_q5_k<<<(n_blocks + nth - 1) / nth, nth, 0, stream>>> + (src0_d, q5_k_compact.get(), n_blocks); + CUDA_CHECK(cudaGetLastError()); + const mmq_args args_q5 = { + q5_k_compact.get(), GGML_TYPE_Q5_K, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, + ne00, ne01, ne1, s01, ne11, s1, + ne02, ne12, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k, ne1}; + ggml_cuda_mul_mat_q_switch_type(ctx, args_q5, stream); + const int64_t stride_src1 = src1->nb[1] / (int64_t)sizeof(float); + const int64_t stride_dst = dst->nb[1] / (int64_t)sizeof(float); + const int64_t n_residual = ne01 * ne1; + ggml_cuda_add_q5_k_hifi_res8_residuals<<<(n_residual + 255) / 256, 256, 0, stream>>> + ((const block_q5_k_hifi_res8 *)src0_d, (const float *)src1_d, dst_d, + ne01, ne00, ne1, s01, stride_src1, stride_dst); + CUDA_CHECK(cudaGetLastError()); + return; + } + const mmq_args args = { src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, ne00, ne01, ne1, s01, ne11, s1, @@ 
-278,6 +344,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t // Q3_K_HIFI excluded - uses MMVQ/dequant path instead case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: + case GGML_TYPE_Q5_K_HIFI_RES8: // Use Q5_K MMQ path (compact copy + residual kernel) case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: From 5e957b905315c1ec67a8b402f8df21c45a1d52bc Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Sat, 28 Feb 2026 12:01:45 +1300 Subject: [PATCH 2/8] First cut at speed increases --- ggml/src/ggml-cuda/mmq.cu | 67 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index 23b852e8f19..abcdee3d20d 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,6 +3,48 @@ #include "quantize.cuh" #include "mmid.cuh" +// Copy Q5_K base (176 bytes) from each Q5_K_HIFI_RES8 block (196 bytes) for MMQ path +static __global__ void ggml_cuda_compact_q5_k_hifi_res8_to_q5_k( + const void * __restrict__ src, void * __restrict__ dst, int64_t n_blocks) { + const int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n_blocks) return; + const char * s = (const char *)src + i * sizeof(block_q5_k_hifi_res8); + char * d = (char *)dst + i * sizeof(block_q5_K); + for (int j = 0; j < (int)sizeof(block_q5_K); ++j) { + d[j] = s[j]; + } +} + +// Add Q5_K_HIFI_RES8 INT8 residual corrections to MMQ output using F32 activations +static __global__ void ggml_cuda_add_q5_k_hifi_res8_residuals( + const block_q5_k_hifi_res8 * __restrict__ x, + const float * __restrict__ src1, float * __restrict__ dst, + int64_t nrows_x, int64_t ncols_x, int64_t ncols_dst, + int64_t stride_row_x, int64_t stride_src1, int64_t stride_dst) { + const int64_t linear = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (linear >= nrows_x * ncols_dst) return; + const int64_t row = linear / ncols_dst; + const int64_t batch = linear % ncols_dst; + const int64_t 
n_blocks = ncols_x / QK_K; + float sum = 0.0f; + for (int64_t b = 0; b < n_blocks; ++b) { + const block_q5_k_hifi_res8 * block = x + row * stride_row_x + b; + const int n_out = (block->outlier_count & 0x7F); + if (n_out == 0) continue; + const uint8_t e4m3 = block->residual_scale_e4m3; + if (e4m3 == 0) continue; + const int sign = (e4m3 >> 7) & 0x01; + const int exp = (e4m3 >> 3) & 0x0F; + const int mantissa = e4m3 & 0x07; + const float res_scale = (1.0f + (float)mantissa * 0.125f) * exp2f((float)exp - 7.0f) * (sign ? -1.0f : 1.0f) * (1.0f / 127.0f); + for (int k = 0; k < n_out && k < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k) { + const int col = b * QK_K + block->outlier_idx[k]; + sum += res_scale * (float)block->residual_vals[k] * src1[batch * stride_src1 + col]; + } + } + dst[batch * stride_dst + row] += sum; +} + static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { switch (args.type_x) { case GGML_TYPE_Q4_0: @@ -147,6 +189,30 @@ void ggml_cuda_mul_mat_q( ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int)); const int64_t s13 = ne12*s12; + if (src0->type == GGML_TYPE_Q5_K_HIFI_RES8) { + const int64_t n_blocks = (ne00 / QK_K) * ne01; + ggml_cuda_pool_alloc q5_k_compact(ctx.pool(), n_blocks * sizeof(block_q5_K)); + const int nth = 256; + ggml_cuda_compact_q5_k_hifi_res8_to_q5_k<<<(n_blocks + nth - 1) / nth, nth, 0, stream>>> + (src0_d, q5_k_compact.get(), n_blocks); + CUDA_CHECK(cudaGetLastError()); + const mmq_args args_q5 = { + q5_k_compact.get(), GGML_TYPE_Q5_K, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, + ne00, ne01, ne1, s01, ne11, s1, + ne02, ne12, s02, s12, s2, + ne03, ne13, s03, s13, s3, + use_stream_k, ne1}; + ggml_cuda_mul_mat_q_switch_type(ctx, args_q5, stream); + const int64_t stride_src1 = src1->nb[1] / (int64_t)sizeof(float); + const int64_t stride_dst = dst->nb[1] / (int64_t)sizeof(float); + const int64_t n_residual = ne01 * ne1; + 
ggml_cuda_add_q5_k_hifi_res8_residuals<<<(n_residual + 255) / 256, 256, 0, stream>>> + ((const block_q5_k_hifi_res8 *)src0_d, (const float *)src1_d, dst_d, + ne01, ne00, ne1, s01, stride_src1, stride_dst); + CUDA_CHECK(cudaGetLastError()); + return; + } + const mmq_args args = { src0_d, src0->type, (const int *) src1_q8_1.ptr, nullptr, nullptr, dst_d, ne00, ne01, ne1, s01, ne11, s1, @@ -278,6 +344,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t // Q3_K_HIFI excluded - uses MMVQ/dequant path instead case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: + case GGML_TYPE_Q5_K_HIFI_RES8: // Use Q5_K MMQ path (compact copy + residual kernel) case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: From 484d9540684765028b956ea10675924aeddc4990 Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Sat, 28 Feb 2026 12:02:22 +1300 Subject: [PATCH 3/8] Improved speed attempt --- ggml/src/ggml-cuda/mmq.cu | 87 +++++++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index abcdee3d20d..cf4fc5ee6df 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -3,46 +3,76 @@ #include "quantize.cuh" #include "mmid.cuh" -// Copy Q5_K base (176 bytes) from each Q5_K_HIFI_RES8 block (196 bytes) for MMQ path +// Copy Q5_K base (176 bytes) from each Q5_K_HIFI_RES8 block (196 bytes) for MMQ path. +// Uses vectorized 4-byte loads: 176/4=44 words, 196/4=49 words (both divisible by 4 so every +// block-start is uint32_t-aligned regardless of block index). 
+static_assert(sizeof(block_q5_K) % sizeof(uint32_t) == 0, "Q5_K size not a multiple of 4"); +static_assert(sizeof(block_q5_k_hifi_res8) % sizeof(uint32_t) == 0, "Q5_K_HIFI_RES8 size not a multiple of 4"); static __global__ void ggml_cuda_compact_q5_k_hifi_res8_to_q5_k( const void * __restrict__ src, void * __restrict__ dst, int64_t n_blocks) { const int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; if (i >= n_blocks) return; - const char * s = (const char *)src + i * sizeof(block_q5_k_hifi_res8); - char * d = (char *)dst + i * sizeof(block_q5_K); - for (int j = 0; j < (int)sizeof(block_q5_K); ++j) { + const uint32_t * s = (const uint32_t *)((const char *)src + i * sizeof(block_q5_k_hifi_res8)); + uint32_t * d = (uint32_t *)((char *)dst + i * sizeof(block_q5_K)); + #pragma unroll + for (int j = 0; j < (int)(sizeof(block_q5_K) / sizeof(uint32_t)); ++j) { d[j] = s[j]; } } -// Add Q5_K_HIFI_RES8 INT8 residual corrections to MMQ output using F32 activations +// Add Q5_K_HIFI_RES8 INT8 residual corrections to MMQ output using F32 activations. +// Parallelised at the (row, block) level rather than (row, batch): +// - 92% of threads hit the early-exit (outlier_count==0) before touching src1 or dst. +// - The 8% of threads that do have outliers loop over all batch slots and atomicAdd +// their contribution. Contention is negligible (~1 writer per output cell on average). 
static __global__ void ggml_cuda_add_q5_k_hifi_res8_residuals( const block_q5_k_hifi_res8 * __restrict__ x, const float * __restrict__ src1, float * __restrict__ dst, int64_t nrows_x, int64_t ncols_x, int64_t ncols_dst, int64_t stride_row_x, int64_t stride_src1, int64_t stride_dst) { - const int64_t linear = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; - if (linear >= nrows_x * ncols_dst) return; - const int64_t row = linear / ncols_dst; - const int64_t batch = linear % ncols_dst; + const int64_t n_blocks = ncols_x / QK_K; - float sum = 0.0f; - for (int64_t b = 0; b < n_blocks; ++b) { - const block_q5_k_hifi_res8 * block = x + row * stride_row_x + b; - const int n_out = (block->outlier_count & 0x7F); - if (n_out == 0) continue; - const uint8_t e4m3 = block->residual_scale_e4m3; - if (e4m3 == 0) continue; - const int sign = (e4m3 >> 7) & 0x01; - const int exp = (e4m3 >> 3) & 0x0F; - const int mantissa = e4m3 & 0x07; - const float res_scale = (1.0f + (float)mantissa * 0.125f) * exp2f((float)exp - 7.0f) * (sign ? -1.0f : 1.0f) * (1.0f / 127.0f); - for (int k = 0; k < n_out && k < Q5_K_HIFI_RES8_MAX_OUTLIERS; ++k) { - const int col = b * QK_K + block->outlier_idx[k]; - sum += res_scale * (float)block->residual_vals[k] * src1[batch * stride_src1 + col]; + const int64_t rb = (int64_t)blockIdx.x * blockDim.x + threadIdx.x; + if (rb >= nrows_x * n_blocks) return; + + const int64_t row = rb / n_blocks; + const int64_t b = rb % n_blocks; + + const block_q5_k_hifi_res8 * block = x + row * stride_row_x + b; + const int n_out = (block->outlier_count & 0x7F); + if (n_out == 0) return; // fast path: ~92% of blocks exit here + + const uint8_t e4m3 = block->residual_scale_e4m3; + if (e4m3 == 0) return; + + // Decode E4M3 FP8 residual scale once, in registers + const int sign = (e4m3 >> 7) & 0x01; + const int exp = (e4m3 >> 3) & 0x0F; + const int mantissa = e4m3 & 0x07; + const float res_scale = (1.0f + (float)mantissa * 0.125f) + * exp2f((float)exp - 7.0f) + * (sign ? 
-1.0f : 1.0f) + * (1.0f / 127.0f); + + // Cache per-outlier column indices and scaled residual values in registers + // so the inner batch loop only reads src1 (no repeated block struct accesses). + const int n_valid = (n_out < Q5_K_HIFI_RES8_MAX_OUTLIERS) ? n_out : Q5_K_HIFI_RES8_MAX_OUTLIERS; + int cols [Q5_K_HIFI_RES8_MAX_OUTLIERS]; + float rvals[Q5_K_HIFI_RES8_MAX_OUTLIERS]; + for (int k = 0; k < n_valid; ++k) { + cols [k] = (int)b * QK_K + block->outlier_idx[k]; + rvals[k] = res_scale * (float)block->residual_vals[k]; + } + + // Accumulate residual dot-products over all batch slots and atomicAdd to dst. + // Low contention: at most ~1.3 enhanced blocks per row on average. + for (int64_t batch = 0; batch < ncols_dst; ++batch) { + float sum = 0.0f; + for (int k = 0; k < n_valid; ++k) { + sum += rvals[k] * src1[batch * stride_src1 + cols[k]]; } + atomicAdd(&dst[batch * stride_dst + row], sum); } - dst[batch * stride_dst + row] += sum; } static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) { @@ -204,9 +234,12 @@ void ggml_cuda_mul_mat_q( use_stream_k, ne1}; ggml_cuda_mul_mat_q_switch_type(ctx, args_q5, stream); const int64_t stride_src1 = src1->nb[1] / (int64_t)sizeof(float); - const int64_t stride_dst = dst->nb[1] / (int64_t)sizeof(float); - const int64_t n_residual = ne01 * ne1; - ggml_cuda_add_q5_k_hifi_res8_residuals<<<(n_residual + 255) / 256, 256, 0, stream>>> + const int64_t stride_dst = dst->nb[1] / (int64_t)sizeof(float); + // Launch one thread per (weight-row, block) pair. + // ~92% of threads exit immediately (no outliers); only ~8% touch src1/dst. 
+ const int64_t n_blocks_per_row = ne00 / QK_K; + const int64_t n_rb = ne01 * n_blocks_per_row; + ggml_cuda_add_q5_k_hifi_res8_residuals<<<(n_rb + 255) / 256, 256, 0, stream>>> ((const block_q5_k_hifi_res8 *)src0_d, (const float *)src1_d, dst_d, ne01, ne00, ne1, s01, stride_src1, stride_dst); CUDA_CHECK(cudaGetLastError()); From 423f87bb0979c70e45529534df14460b8fae7213 Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Sat, 28 Feb 2026 16:25:48 +1300 Subject: [PATCH 4/8] Better error messages --- tools/quantize/quantize.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index d2a5969972c..8997e661379 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -220,6 +220,15 @@ static int load_legacy_imatrix(const std::string & imatrix_file, std::vector & imatrix_datasets, std::unordered_map> & imatrix_data) { + if (!std::filesystem::exists(imatrix_file)) { + fprintf(stderr, "%s: imatrix file '%s' not found\n", __func__, imatrix_file.c_str()); + exit(1); + } + if (!std::filesystem::is_regular_file(imatrix_file)) { + fprintf(stderr, "%s: imatrix path '%s' is not a regular file\n", __func__, imatrix_file.c_str()); + exit(1); + } + struct ggml_context * ctx = nullptr; struct gguf_init_params meta_gguf_params = { /* .no_alloc = */ false, // the data is needed From 6856a7fcc619b16c52d1601f300d3c592cebdfb6 Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Sun, 1 Mar 2026 15:46:00 +1300 Subject: [PATCH 5/8] Metal kernal improvements --- ggml/src/ggml-metal/ggml-metal-device.cpp | 12 +- ggml/src/ggml-metal/ggml-metal.metal | 155 ++++++++++++++++++++++ 2 files changed, 162 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 9f77d7adfcc..54df6761490 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -148,7 +148,9 @@ ggml_metal_pipeline_with_params 
ggml_metal_library_get_pipeline_get_rows(ggml_me char base[256]; char name[256]; - snprintf(base, 256, "kernel_get_rows_%s", ggml_type_name(tsrc)); + // Use ggml_metal_type_name_for_kernel for HIFI types so the kernel name matches + // the dedicated kernels registered in ggml-metal.metal (e.g. "q5_K_hifi_res8") + snprintf(base, 256, "kernel_get_rows_%s", ggml_metal_type_name_for_kernel(tsrc)); snprintf(name, 256, "%s", base); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); @@ -532,9 +534,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv(ggml_metal_ return res; } -// Map HIFI types to their base types for kernel name generation -// Since HIFI types are based on Q6_K/Q5_K, they can use the same kernels -// Q3_K_HIFI has its own dedicated kernel, so it needs its own name +// Map HIFI types to their kernel name counterparts +// Q3_K_HIFI, Q4_K_HIFI, Q5_K_HIFI_RES8 have dedicated kernels with correct block strides +// Q6_K HIFI variants reuse Q6_K kernels (TODO: fix stride mismatch for Q6_K HIFI types) static const char * ggml_metal_type_name_for_kernel(ggml_type type) { switch (type) { case GGML_TYPE_Q3_K_HIFI: @@ -546,7 +548,7 @@ static const char * ggml_metal_type_name_for_kernel(ggml_type type) { case GGML_TYPE_Q6_K_HIFI_RES8: return "q6_K"; case GGML_TYPE_Q5_K_HIFI_RES8: - return "q5_K"; + return "q5_K_hifi_res8"; default: return ggml_type_name(type); } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 4072aa53525..371b6000d45 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -948,6 +948,14 @@ void dequantize_q4_k_hifi(device const block_q4_k_hifi * xb, short il, thread ty } } +// Q5_K_HIFI_RES8: Q5_K layout + 20-byte INT8 residual extension (196 bytes total) +// The base Q5_K fields (d, dmin, scales, qh, qs) are at identical byte offsets. +// Residual corrections are not applied in the Metal path (only in CPU path). 
+template +void dequantize_q5_k_hifi_res8(device const block_q5_k_hifi_res8 * xb, short il, thread type4x4 & reg) { + dequantize_q5_K((device const block_q5_K *)xb, il, reg); +} + enum ggml_sort_order { GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC, @@ -3810,6 +3818,13 @@ template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_3")]] kernel mul_mv_ext_q4x4 template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_4")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_K, 256, dequantize_q5_K>; template [[host_name("kernel_mul_mv_ext_q5_K_f32_r1_5")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_K, 256, dequantize_q5_K>; +typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_k_hifi_res8, 256, dequantize_q5_k_hifi_res8>) mul_mv_ext_q5_K_hifi_res8_f32_t; + +template [[host_name("kernel_mul_mv_ext_q5_K_hifi_res8_f32_r1_2")]] kernel mul_mv_ext_q5_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q5_k_hifi_res8, 256, dequantize_q5_k_hifi_res8>; +template [[host_name("kernel_mul_mv_ext_q5_K_hifi_res8_f32_r1_3")]] kernel mul_mv_ext_q5_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q5_k_hifi_res8, 256, dequantize_q5_k_hifi_res8>; +template [[host_name("kernel_mul_mv_ext_q5_K_hifi_res8_f32_r1_4")]] kernel mul_mv_ext_q5_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q5_k_hifi_res8, 256, dequantize_q5_k_hifi_res8>; +template [[host_name("kernel_mul_mv_ext_q5_K_hifi_res8_f32_r1_5")]] kernel mul_mv_ext_q5_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q5_k_hifi_res8, 256, dequantize_q5_k_hifi_res8>; + template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_2")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_K, 256, dequantize_q6_K>; template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_3")]] kernel mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_K, 256, dequantize_q6_K>; template [[host_name("kernel_mul_mv_ext_q6_K_f32_r1_4")]] kernel 
mul_mv_ext_q4x4_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_K, 256, dequantize_q6_K>; @@ -7867,6 +7882,140 @@ kernel void kernel_mul_mv_q5_K_f32( kernel_mul_mv_q5_K_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); } +// Q5_K_HIFI_RES8: identical to Q5_K mul_mv but uses block_q5_k_hifi_res8 pointer (196-byte stride) +// The base Q5_K fields are at identical byte offsets; HIFI residual extension is ignored here. +template +void kernel_mul_mv_q5_K_hifi_res8_f32_impl( + args_t args, + device const char * src0, + device const char * src1, + device char * dst, + threadgroup char * shmem, + uint3 tgpig, + ushort tiisg, + ushort sgitg) { + const short NSG = FC_mul_mv_nsg; + + const int nb = args.ne00/QK_K; + + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * NSG + sgitg) * nr0; + + const uint i12 = im%args.ne12; + const uint i13 = im/args.ne12; + + const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03; + const uint64_t offset1 = r1*args.nb11 + (i12 )*args.nb12 + (i13 )*args.nb13; + + // KEY FIX: use correct 196-byte struct stride instead of block_q5_K (176 bytes) + device const block_q5_k_hifi_res8 * x = (device const block_q5_k_hifi_res8 *) (src0 + offset0); + device const float * yy = (device const float *) (src1 + offset1); + + float sumf[nr0]={0.f}; + + float yl[16], yh[16]; + + constexpr uint16_t kmask1 = 0x3f3f; + constexpr uint16_t kmask2 = 0x0f0f; + constexpr uint16_t kmask3 = 0xc0c0; + + const short tid = tiisg/4; + const short ix = tiisg%4; + const short iq = tid/4; + const short ir = tid%4; + + const short l0 = 8*ir; + const short q_offset = 32*iq + l0; + const short y_offset = 64*iq + l0; + + const uint8_t hm1 = 1u << (2*iq); + const uint8_t hm2 = hm1 << 1; + const uint8_t hm3 = hm1 << 4; + const uint8_t hm4 = hm2 << 4; + + uint16_t sc16[4]; + thread const uint8_t * sc8 = (thread const uint8_t *)sc16; + + device const float * y1 = yy + 
ix*QK_K + y_offset; + + for (int i = ix; i < nb; i += 4) { + device const uint8_t * q1 = x[i].qs + q_offset; + device const uint8_t * qh = x[i].qh + l0; + device const half * dh = &x[i].d; + device const uint16_t * a = (device const uint16_t *)x[i].scales + iq; + + device const float * y2 = y1 + 128; + float4 sumy = {0.f, 0.f, 0.f, 0.f}; + for (short l = 0; l < 8; ++l) { + yl[l+0] = y1[l+ 0]; sumy[0] += yl[l+0]; + yl[l+8] = y1[l+32]; sumy[1] += yl[l+8]; + yh[l+0] = y2[l+ 0]; sumy[2] += yh[l+0]; + yh[l+8] = y2[l+32]; sumy[3] += yh[l+8]; + } + + for (short row = 0; row < nr0; ++row) { + device const uint8_t * q2 = q1 + 64; + + sc16[0] = a[0] & kmask1; + sc16[1] = a[2] & kmask1; + sc16[2] = ((a[4] >> 0) & kmask2) | ((a[0] & kmask3) >> 2); + sc16[3] = ((a[4] >> 4) & kmask2) | ((a[2] & kmask3) >> 2); + + float4 acc1 = {0.f}; + float4 acc2 = {0.f}; + FOR_UNROLL (short l = 0; l < 8; ++l) { + uint8_t h = qh[l]; + acc1[0] += yl[l+0] * (q1[l] & 0x0F); + acc1[1] += yl[l+8] * (q1[l] & 0xF0); + acc1[2] += yh[l+0] * (q2[l] & 0x0F); + acc1[3] += yh[l+8] * (q2[l] & 0xF0); + acc2[0] += h & hm1 ? yl[l+0] : 0.f; + acc2[1] += h & hm2 ? yl[l+8] : 0.f; + acc2[2] += h & hm3 ? yh[l+0] : 0.f; + acc2[3] += h & hm4 ? 
yh[l+8] : 0.f; + } + + sumf[row] += dh[0] * (sc8[0] * (acc1[0] + 16.f*acc2[0]) + + sc8[1] * (acc1[1]/16.f + 16.f*acc2[1]) + + sc8[4] * (acc1[2] + 16.f*acc2[2]) + + sc8[5] * (acc1[3]/16.f + 16.f*acc2[3])) - + dh[1] * (sumy[0] * sc8[2] + sumy[1] * sc8[3] + sumy[2] * sc8[6] + sumy[3] * sc8[7]); + + q1 += args.nb01; + qh += args.nb01; + dh += args.nb01/2; + a += args.nb01/2; + } + + y1 += 4 * QK_K; + } + + device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0; + + for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) { + const float tot = simd_sum(sumf[row]); + if (tiisg == 0) { + dst_f32[first_row + row] = tot; + } + } +} + +[[host_name("kernel_mul_mv_q5_K_hifi_res8_f32")]] +kernel void kernel_mul_mv_q5_K_hifi_res8_f32( + constant ggml_metal_kargs_mul_mv & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_q5_K_hifi_res8_f32_impl(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg); +} + template void kernel_mul_mv_q6_K_f32_impl( args_t args, @@ -9892,6 +10041,7 @@ template [[host_name("kernel_get_rows_q3_k_hifi")]] kernel get_rows_q_t kernel_g template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_q4_k_hifi")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_q_t kernel_get_rows_q; +template [[host_name("kernel_get_rows_q5_k_hifi_res8")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_q_t kernel_get_rows_q; template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_q_t kernel_get_rows_q; @@ -9956,6 +10106,7 @@ template 
[[host_name("kernel_mul_mm_q3_k_hifi_f32")]] kernel mul_mm_t kernel_mul template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_k_hifi_f32")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_hifi_res8_f32")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm; @@ -9981,6 +10132,7 @@ template [[host_name("kernel_mul_mm_q3_k_hifi_f16")]] kernel mul_mm_t kernel_mul template [[host_name("kernel_mul_mm_q4_K_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q4_k_hifi_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q5_K_f16")]] kernel mul_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_hifi_res8_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f16")]] kernel mul_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f16")]] kernel mul_mm_t kernel_mul_mm; @@ -10015,6 +10167,7 @@ template [[host_name("kernel_mul_mm_id_q3_k_hifi_f32")]] kernel mul_mm_id kernel template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q4_k_hifi_f32")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_hifi_res8_f32")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id; template 
[[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id; @@ -10040,6 +10193,7 @@ template [[host_name("kernel_mul_mm_id_q3_k_hifi_f16")]] kernel mul_mm_id kernel template [[host_name("kernel_mul_mm_id_q4_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q4_k_hifi_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_hifi_res8_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id; @@ -10197,6 +10351,7 @@ template [[host_name("kernel_mul_mv_id_q3_k_hifi_f32")]] kernel kernel_mul_mv_id template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_q4_k_hifi_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q5_K_hifi_res8_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; From 4bc13c25cee56f8bc4fca6cfff549b76fcd34810 Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Sun, 1 Mar 2026 15:48:07 +1300 Subject: [PATCH 6/8] Build error fixed --- ggml/src/ggml-metal/ggml-metal-device.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 
54df6761490..3faab1867f3 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -144,6 +144,8 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d(ggml_met return res; } +static const char * ggml_metal_type_name_for_kernel(ggml_type type); // forward declaration + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows(ggml_metal_library_t lib, ggml_type tsrc) { char base[256]; char name[256]; From 3252ed2eee6729009755e960a8243d04d06e7b3d Mon Sep 17 00:00:00 2001 From: Geoff Munn Date: Sun, 1 Mar 2026 21:57:19 +1300 Subject: [PATCH 7/8] NaN errors fixed --- ggml/src/ggml-metal/ggml-metal-device.cpp | 3 +- ggml/src/ggml-metal/ggml-metal.metal | 129 ++++++++++++++++++++++ 2 files changed, 131 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 3faab1867f3..cda71256341 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -547,8 +547,9 @@ static const char * ggml_metal_type_name_for_kernel(ggml_type type) { return "q4_k_hifi"; case GGML_TYPE_Q6_K_HIFI: case GGML_TYPE_Q6_K_HIFI_DYNAMIC: - case GGML_TYPE_Q6_K_HIFI_RES8: return "q6_K"; + case GGML_TYPE_Q6_K_HIFI_RES8: + return "q6_K_hifi_res8"; case GGML_TYPE_Q5_K_HIFI_RES8: return "q5_K_hifi_res8"; default: diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 371b6000d45..93c50b5ecac 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -677,6 +677,14 @@ void dequantize_q6_K(device const block_q6_K *xb, short il, thread type4x4 & reg } } +// Q6_K_HIFI_RES8: Q6_K layout + 22-byte INT8 residual extension (232 bytes total) +// The base Q6_K fields (ql, qh, scales, d) are at identical byte offsets. +// Residual corrections are not applied in the Metal path (only in CPU path). 
// ---------------------------------------------------------------------------
// Q6_K_HIFI_RES8 support for the Metal backend.
//
// Reconstructed from a whitespace/markup-mangled patch: the template
// parameter lists inside <...> were stripped by the mangling and have been
// restored from the one instantiation whose arguments survived (the
// mul_mv_ext typedef below) and from the matching Q6_K definitions that
// appear as diff context. Lines marked TODO(review) could not be recovered
// verbatim and must be confirmed against the original file.
//
// NOTE(review): this path dequantizes only the Q6_K base portion of each
// block_q6_k_hifi_res8; the trailing INT8 residual corrections are not
// applied here — presumably a separate pass adds them (the CUDA side of this
// patch series uses a dedicated residual kernel). Confirm this is intentional.
// ---------------------------------------------------------------------------

// Dequantize one 4x4 tile from a Q6_K_HIFI_RES8 block by delegating to the
// plain Q6_K dequantizer: the hifi block layout begins with a block_q6_K.
template <typename type4x4>
void dequantize_q6_k_hifi_res8(device const block_q6_k_hifi_res8 * xb, short il, thread type4x4 & reg) {
    dequantize_q6_K((device const block_q6_K *) xb, il, reg);
}

// "Extended" mat-vec kernels (r1 = 2..5 src1 rows per threadgroup), reusing
// the generic q4x4 dispatcher with the hifi block type and dequantizer.
// (This typedef's argument list survived the mangling intact.)
typedef decltype(kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_k_hifi_res8, 256, dequantize_q6_k_hifi_res8>) mul_mv_ext_q6_K_hifi_res8_f32_t;

template [[host_name("kernel_mul_mv_ext_q6_K_hifi_res8_f32_r1_2")]] kernel mul_mv_ext_q6_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<2, block_q6_k_hifi_res8, 256, dequantize_q6_k_hifi_res8>;
template [[host_name("kernel_mul_mv_ext_q6_K_hifi_res8_f32_r1_3")]] kernel mul_mv_ext_q6_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<3, block_q6_k_hifi_res8, 256, dequantize_q6_k_hifi_res8>;
template [[host_name("kernel_mul_mv_ext_q6_K_hifi_res8_f32_r1_4")]] kernel mul_mv_ext_q6_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<4, block_q6_k_hifi_res8, 256, dequantize_q6_k_hifi_res8>;
template [[host_name("kernel_mul_mv_ext_q6_K_hifi_res8_f32_r1_5")]] kernel mul_mv_ext_q6_K_hifi_res8_f32_t kernel_mul_mv_ext_q4x4_f32_disp<5, block_q6_k_hifi_res8, 256, dequantize_q6_k_hifi_res8>;

// mat-vec: dst = src0 * src1 where src0 is Q6_K_HIFI_RES8 (Q6_K base only).
// Mirrors kernel_mul_mv_q6_K_f32_impl: one simdgroup computes nr0 output
// rows; each thread covers 16 of the 256 values of a super-block, split as
// four runs of 4 taken from the four 32-value quadrants of a 128-value half.
// TODO(review): template header restored as <int nr0, typename args_t> —
// nr0 must be a compile-time constant (it sizes sumf[]); confirm.
template<int nr0, typename args_t>
void kernel_mul_mv_q6_K_hifi_res8_f32_impl(
        args_t args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        threadgroup  char * shmem,
        uint3  tgpig,
        ushort tiisg,
        ushort sgitg) {
    const short NSG = FC_mul_mv_nsg;

    // Masks selecting the 2 high bits of each 6-bit quant within qh,
    // one mask per 32-value quadrant.
    constexpr uint8_t kmask1 = 0x03;
    constexpr uint8_t kmask2 = 0x0C;
    constexpr uint8_t kmask3 = 0x30;
    constexpr uint8_t kmask4 = 0xC0;

    const int nb = args.ne00/QK_K;  // super-blocks per src0 row

    const int r0 = tgpig.x;
    const int r1 = tgpig.y;
    const int im = tgpig.z;

    const int first_row = (r0 * NSG + sgitg) * nr0;

    const uint i12 = im%args.ne12;
    const uint i13 = im/args.ne12;

    const uint64_t offset0 = first_row*args.nb01 + (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
    const uint64_t offset1 = r1*args.nb11 + (i12        )*args.nb12 + (i13        )*args.nb13;

    device const block_q6_k_hifi_res8 * x  = (device const block_q6_k_hifi_res8 *) (src0 + offset0);
    device const float                * yy = (device const float                *) (src1 + offset1);

    float sumf[nr0] = { 0.f };

    float yl[16];

    // Thread -> sub-block mapping (identical to the plain Q6_K kernel):
    // even/odd lanes interleave over super-blocks (ix), tid picks the
    // 128-value half (ip) and the 4-value run within it (il).
    const short tid = tiisg/2;
    const short ix  = tiisg%2;
    const short ip  = tid/8;       // 0 or 1: first or second 128-value half
    const short il  = tid%8;
    const short l0  = 4*il;
    const short is  = 8*ip + l0/16;

    const short y_offset   = 128*ip + l0;
    const short q_offset_l =  64*ip + l0;
    const short q_offset_h =  32*ip + l0;

    for (int i = ix; i < nb; i += 2) {
        device const uint8_t * q1 = x[i].ql + q_offset_l;
        device const uint8_t * q2 = q1 + 32;
        device const uint8_t * qh = x[i].qh + q_offset_h;
        device const int8_t  * sc = x[i].scales + is;
        device const half    * dh = &x[i].d;

        device const float * y = yy + i * QK_K + y_offset;

        // Gather the 16 activations this thread needs, one from each quadrant.
        for (short l = 0; l < 4; ++l) {
            yl[4*l + 0] = y[l +  0];
            yl[4*l + 1] = y[l + 32];
            yl[4*l + 2] = y[l + 64];
            yl[4*l + 3] = y[l + 96];
        }

        for (short row = 0; row < nr0; ++row) {
            float4 sums = {0.f, 0.f, 0.f, 0.f};

            // Reassemble each 6-bit quant from 4 low bits (ql) + 2 high bits
            // (qh), recentre by -32, and accumulate per quadrant.
            FOR_UNROLL (short l = 0; l < 4; ++l) {
                sums[0] += yl[4*l + 0] * ((int8_t)((q1[l] & 0xF) | ((qh[l] & kmask1) << 4)) - 32);
                sums[1] += yl[4*l + 1] * ((int8_t)((q2[l] & 0xF) | ((qh[l] & kmask2) << 2)) - 32);
                sums[2] += yl[4*l + 2] * ((int8_t)((q1[l]  >> 4) | ((qh[l] & kmask3) << 0)) - 32);
                sums[3] += yl[4*l + 3] * ((int8_t)((q2[l]  >> 4) | ((qh[l] & kmask4) >> 2)) - 32);
            }

            // Apply the per-16 sub-block scales and the super-block scale d.
            sumf[row] += dh[0] * (sums[0] * sc[0] + sums[1] * sc[2] + sums[2] * sc[4] + sums[3] * sc[6]);

            // Advance every pointer by one src0 row (nb01 bytes).
            q1 += args.nb01;
            q2 += args.nb01;
            qh += args.nb01;
            sc += args.nb01;
            dh += args.nb01/2;  // half* arithmetic: nb01 bytes == nb01/2 halves
        }
    }

    device float * dst_f32 = (device float *) dst + (uint64_t)im*args.ne0*args.ne1 + (uint64_t)r1*args.ne0;

    // Reduce the per-lane partial sums across the simdgroup; lane 0 writes,
    // guarding the tail rows of the grid.
    for (int row = 0; row < nr0 && first_row + row < args.ne0; ++row) {
        float sum_all = simd_sum(sumf[row]);
        if (tiisg == 0) {
            dst_f32[first_row + row] = sum_all;
        }
    }
}

[[host_name("kernel_mul_mv_q6_K_hifi_res8_f32")]]
kernel void kernel_mul_mv_q6_K_hifi_res8_f32(
        constant ggml_metal_kargs_mul_mv & args,
        device const char * src0,
        device const char * src1,
        device       char * dst,
        uint3  tgpig[[threadgroup_position_in_grid]],
        ushort tiisg[[thread_index_in_simdgroup]],
        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
    // TODO(review): explicit template arguments restored from the Q6_K
    // dispatch pattern; the mangled source lost the <...> list — confirm.
    kernel_mul_mv_q6_K_hifi_res8_f32_impl<N_R0_Q6_K, constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
}

// ======================= "True" 2-bit

// get_rows instantiation for the new type. TODO(review): the <...> argument
// list was stripped by the mangling; restored to follow the neighbouring
// kernel_get_rows_q6_K instantiation with block/dequantizer swapped — confirm
// the nl argument (QK_NL) against that line.
template [[host_name("kernel_get_rows_q6_K_hifi_res8")]] kernel get_rows_q_t kernel_get_rows_q<block_q6_k_hifi_res8, QK_NL, dequantize_q6_k_hifi_res8>;

// NOTE(review): the surrounding unchanged diff-context lines (get_rows /
// mul_mm instantiations for other quant types) are elided here; they are
// unmodified in the target file.
kernel_mul
// NOTE(review): mangled patch text — every explicit template argument list
// inside <...> was stripped from these instantiation lines (the dangling ">>"
// tokens in the mul_mv_id hunk below are the remnants of nested <...<...>>
// arguments). The added "+..." lines for q6_K_hifi_res8 must carry the same
// argument shape as the q6_K line directly above each of them; recover the
// exact arguments from the original file before applying.
template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_q5_K_hifi_res8_f32")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mul_mm_t kernel_mul_mm;
+template [[host_name("kernel_mul_mm_q6_K_hifi_res8_f32")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm;
@@ -10134,6 +10259,7 @@ template [[host_name("kernel_mul_mm_q4_k_hifi_f16")]] kernel mul_mm_t kernel_mul
template [[host_name("kernel_mul_mm_q5_K_f16")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_q5_K_hifi_res8_f16")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_q6_K_f16")]] kernel mul_mm_t kernel_mul_mm;
+template [[host_name("kernel_mul_mm_q6_K_hifi_res8_f16")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_iq2_xxs_f16")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_iq2_xs_f16")]] kernel mul_mm_t kernel_mul_mm;
template [[host_name("kernel_mul_mm_iq3_xxs_f16")]] kernel mul_mm_t kernel_mul_mm;
@@ -10169,6 +10295,7 @@ template [[host_name("kernel_mul_mm_id_q4_k_hifi_f32")]] kernel mul_mm_id kernel
template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_q5_K_hifi_res8_f32")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mul_mm_id kernel_mul_mm_id;
+template [[host_name("kernel_mul_mm_id_q6_K_hifi_res8_f32")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mul_mm_id kernel_mul_mm_id;
template
[[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mul_mm_id kernel_mul_mm_id;
@@ -10195,6 +10322,7 @@ template [[host_name("kernel_mul_mm_id_q4_k_hifi_f16")]] kernel mul_mm_id kernel
template [[host_name("kernel_mul_mm_id_q5_K_f16")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_q5_K_hifi_res8_f16")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_q6_K_f16")]] kernel mul_mm_id kernel_mul_mm_id;
+template [[host_name("kernel_mul_mm_id_q6_K_hifi_res8_f16")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]] kernel mul_mm_id kernel_mul_mm_id;
template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id;
@@ -10353,6 +10481,7 @@ template [[host_name("kernel_mul_mv_id_q4_k_hifi_f32")]] kernel kernel_mul_mv_id
template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
template [[host_name("kernel_mul_mv_id_q5_K_hifi_res8_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
+template [[host_name("kernel_mul_mv_id_q6_K_hifi_res8_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>;
// --- PATCH 8/8: CUDA mmq.cuh q8_1 ds-layout entry for Q5_K_HIFI_RES8 ---
// NOTE(review): the author e-mail address between <...> on the "From:" line
// was also stripped by the same mangling.
From c4861e9352da6ba49e957e223a13bd1480c0084c Mon Sep 17 00:00:00 2001
From: Geoff Munn
Date: Mon, 2 Mar 2026 18:55:24 +1300
Subject: [PATCH 8/8] Add support for Q5_K_HIFI_RES8 layout in
 mmq_get_q8_1_ds_layout function
---
 ggml/src/ggml-cuda/mmq.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index a382e6a6979..efe9e03459c 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -74,6 +74,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) { return MMQ_Q8_1_DS_LAYOUT_D4; case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: + case GGML_TYPE_Q5_K_HIFI_RES8: // uses Q5_K MMQ kernel after compact copy return MMQ_Q8_1_DS_LAYOUT_DS4; case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: