From 69739f9ef50c99536f53cb188a380d4a91cf4606 Mon Sep 17 00:00:00 2001
From: kskd1804
Date: Wed, 17 Jul 2024 17:57:52 -0700
Subject: [PATCH 1/3] Added dequantize_q4_1 function to ggml

---
Reviewer notes (text between "---" and the diff is not part of the commit):
  * A Q4_1 block is 2 + 2 + 16 = 20 bytes: a float16 scale, a float16 min,
    then 16 bytes that each pack two 4-bit quants (32 quants per block).
  * dequantize_q4_1 returns float32 values computed as scale * quant + min,
    laid out per block as all low nibbles followed by all high nibbles.

 src/transformers/integrations/ggml.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index 5c2d72c345ec..ea7c9d1e026d 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -52,6 +52,7 @@
     "Q4_K": 144,
     # Q4_0 uses a blocksize of 32 but the 4-bit tensors are packed into 8-bit tensors + 2 bytes for the scales
     "Q4_0": 2 + 16,
+    "Q4_1": 2 + 2 + 16,
     "Q6_K": 210,
     # See: https://github.com/99991/pygguf/commit/a417edbfc029a1bc270f984a694f9128c5afa8b9
     "Q2_K": 256 // 16 + 256 // 4 + 2 + 2,
@@ -273,6 +274,36 @@ def dequantize_q4_0(data):
     return (scales * quants).astype(np.float32)
 
 
+def dequantize_q4_1(data):
+    # C implementation
+    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L1106
+    # C struct definition
+    # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.h#L18
+    block_size = GGML_BLOCK_SIZES["Q4_1"]
+    num_blocks = len(data) // block_size
+
+    data_f16 = np.frombuffer(data, dtype=np.float16).reshape(num_blocks, block_size // 2)
+    data_u8 = np.frombuffer(data, dtype=np.uint8).reshape(num_blocks, block_size)
+
+    # The scales are stored on the first 2 bytes
+    scales = data_f16[:, 0].reshape(num_blocks, 1).astype(np.float32)
+    # scales = np.nan_to_num(scales)
+
+    # The mins are stored on the second 2 bytes
+    mins = data_f16[:, 1].reshape(num_blocks, 1).astype(np.float32)
+
+    # the rest of the bytes corresponds to the quants - we discard the first four bytes
+    quants = data_u8[:, 4:]
+
+    ql = (quants[:, :] & 0xF).astype(np.int8)
+    qr = (quants[:, :] >> 4).astype(np.int8)
+
+    # Use hstack
+    quants = np.hstack([ql, qr])
+
+    return ((scales * quants) + mins).astype(np.float32)
+
+
 def dequantize_q6_k(data):
     # C implementation
     # https://github.com/ggerganov/ggml/blob/fca1caafea7de9fbd7efc733b9818f9cf2da3050/src/ggml-quants.c#L2275

From b0e94fc446a0c3049cbe0b3d4816aa03d8214089 Mon Sep 17 00:00:00 2001
From: kskd1804
Date: Wed, 17 Jul 2024 17:58:14 -0700
Subject: [PATCH 2/3] Added ggml type for q4_1

---
 src/transformers/integrations/ggml.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index ea7c9d1e026d..b18c832eff25 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -37,6 +37,7 @@
 GGML_TYPES = {
     "F32": 0,
     "Q4_0": 2,
+    "Q4_1": 3,
     "Q8_0": 8,
     "Q2_K": 10,
     "Q3_K": 11,

From 31ec19ca313d35f8ec4365cee86ecceea5135e25 Mon Sep 17 00:00:00 2001
From: kskd1804
Date: Wed, 17 Jul 2024 18:35:55 -0700
Subject: [PATCH 3/3] Added calls to dequantize_q4_1 function

---
 src/transformers/integrations/ggml.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/transformers/integrations/ggml.py b/src/transformers/integrations/ggml.py
index b18c832eff25..86a09902cabf 100644
--- a/src/transformers/integrations/ggml.py
+++ b/src/transformers/integrations/ggml.py
@@ -525,6 +525,8 @@ def load_dequant_gguf_tensor(shape, ggml_type, data):
         values = dequantize_q8_0(data)
     elif ggml_type == GGML_TYPES["Q4_0"]:
         values = dequantize_q4_0(data)
+    elif ggml_type == GGML_TYPES["Q4_1"]:
+        values = dequantize_q4_1(data)
     elif ggml_type == GGML_TYPES["Q4_K"]:
         values = dequantize_q4_k(data)
     elif ggml_type == GGML_TYPES["Q6_K"]: