From 69739f9ef50c99536f53cb188a380d4a91cf4606 Mon Sep 17 00:00:00 2001
From: kskd1804 <kskd1804@gmail.com>
Date: Wed, 17 Jul 2024 17:57:52 -0700
Subject: [PATCH 1/3] Added dequantize_q4_1 function to ggml

---
 src/transformers/integrations/ggml.py | 31 +++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 5c2d72c345ec..ea7c9d1e026d 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -52,6 +52,7 @@
     "Q4_K": 144,
     # Q4_0 uses a blocksize of 32 but the 4-bit tensors are packed into 8-bit tensors + 2 bytes for the scales
     "Q4_0": 2 + 16,
+    "Q4_1": 2 + 2 + 16,
     "Q6_K": 210,
     # See: https://github.com/99991/pygguf/commit/a417edbfc029a1bc270f984a694f9128c5afa8b9
     "Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
@@ -273,6 +274,36 @@ def dequantize_q4_0(data):
     return (scales * quants).astype(np.float32)
 
 
+def dequantize_q4_1(data):
+    # Dequantize a buffer of Q4_1 blocks into float32 values computed as scale * quant + min.
+    # C implementation: https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1106
+    # C struct definition: https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L18
+    # Returns one row of 32 values per block; len(data) must be a whole number of blocks or the reshapes below fail.
+    block_size = GGML_BLOCK_SIZES["Q4_1"]
+    num_blocks = len(data) // block_size
+
+    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
+    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
+
+    # The scale is the float16 stored in the first 2 bytes of each block
+    scales = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)    
+    # NOTE: NaN scales are passed through unchanged; np.nan_to_num is not applied here
+
+    # The min (offset) is the float16 stored in the next 2 bytes of each block
+    mins = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32)
+
+    # The remaining bytes hold the packed 4-bit quants - skip the 4 header bytes (scale + min)
+    quants = data_u8[:, 4:]
+
+    ql = (quants[:, :] & 0xF).astype(np.int8)
+    qr = (quants[:, :] >> 4).astype(np.int8)
+
+    # Unpacked low nibbles fill the first half of each block's row, high nibbles the second half
+    quants = np.hstack([ql, qr])
+
+    return ((scales * quants) + mins).astype(np.float32)
+
+
 def dequantize_q6_k(data):
     # C implementation
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275

From b0e94fc446a0c3049cbe0b3d4816aa03d8214089 Mon Sep 17 00:00:00 2001
From: kskd1804 <kskd1804@gmail.com>
Date: Wed, 17 Jul 2024 17:58:14 -0700
Subject: [PATCH 2/3] Added ggml type for q4_1

---
 src/transformers/integrations/ggml.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index ea7c9d1e026d..b18c832eff25 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -37,6 +37,7 @@
 GGML_TYPES = {
     "F32": 0,
     "Q4_0": 2,
+    "Q4_1": 3,
     "Q8_0": 8,
     "Q2_K": 10,
     "Q3_K": 11,

From 31ec19ca313d35f8ec4365cee86ecceea5135e25 Mon Sep 17 00:00:00 2001
From: kskd1804 <kskd1804@gmail.com>
Date: Wed, 17 Jul 2024 18:35:55 -0700
Subject: [PATCH 3/3] Added calls to dequantize_q4_1 function

---
 src/transformers/integrations/ggml.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index b18c832eff25..86a09902cabf 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -525,6 +525,8 @@ def load_dequant_gguf_tensor(shape, ggml_type, data):
         values = dequantize_q8_0(data)
     elif ggml_type == GGML_TYPES["Q4_0"]:
         values = dequantize_q4_0(data)
+    elif ggml_type == GGML_TYPES["Q4_1"]:
+        values = dequantize_q4_1(data)
     elif ggml_type == GGML_TYPES["Q4_K"]:
         values = dequantize_q4_k(data)
     elif ggml_type == GGML_TYPES["Q6_K"]: