From 7772fa3d35e5b76f8113e46376ff1b15699bd276 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Mon, 6 May 2024 15:45:01 +0000 Subject: [PATCH 01/76] doc-builder image had been changed, need to revert to old one due to dependency issues, see https://huggingface.slack.com/archives/C021H1P1HKR/p1714469081588139 --- .github/workflows/build_documentation.yml | 1 + .github/workflows/build_pr_documentation.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 10272be87..a19e7511d 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -14,5 +14,6 @@ jobs: commit_sha: ${{ github.sha }} package: bitsandbytes repo_owner: TimDettmers + custom_container: huggingface/transformers-doc-builder secrets: hf_token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index d6455fd11..cc833df5d 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -16,3 +16,4 @@ jobs: pr_number: ${{ github.event.number }} package: bitsandbytes repo_owner: TimDettmers + custom_container: huggingface/transformers-doc-builder From b659c70d5cd78b80526333755b75cbf2a8c60663 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 7 May 2024 15:36:17 +0200 Subject: [PATCH 02/76] Update CONTRIBUTING.md --- CONTRIBUTING.md | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0fae0ace5..76d7327a8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,23 +9,12 @@ We actively welcome your pull requests. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: +5. Make sure your code lints, install the [pre-commit hooks as documented here](https://huggingface.co/docs/bitsandbytes/main/en/contributing#setup-pre-commit-hooks). ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. -Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - ## License By contributing to bitsandbytes, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. From b97ea779a8d0a20e1d6f9ce265416ef6fbbc7a89 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 7 May 2024 15:48:12 +0200 Subject: [PATCH 03/76] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2cf630dcb..def6ab52d 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,6 @@ There are ongoing efforts to support further hardware backends, i.e. 
Intel CPU + ## License -The majority of bitsandbytes is licensed under MIT, however small portions of the project are available under separate license terms, as the parts adapted from Pytorch are licensed under the BSD license. +`bitsandbytes` MIT licensed. We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. From b891f80ba514833f41f0e9226983b02a9fb5c44b Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 7 May 2024 15:49:37 +0200 Subject: [PATCH 04/76] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index def6ab52d..3f40b1e0e 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,6 @@ There are ongoing efforts to support further hardware backends, i.e. Intel CPU + ## License -`bitsandbytes` MIT licensed. +`bitsandbytes` is MIT licensed. We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. From 09cc153dea939f23747bea622560c84b5a95183f Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Wed, 8 May 2024 02:10:49 -0700 Subject: [PATCH 05/76] Support NF4 on CPU backend --- bitsandbytes/autograd/_functions.py | 3 +- bitsandbytes/backends/cpu.py | 15 +- bitsandbytes/backends/cpu_xpu_common.py | 266 +++++++++++++++++++++++- bitsandbytes/nn/modules.py | 7 +- 4 files changed, 284 insertions(+), 7 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 7d570f28b..6dea211ff 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -572,7 +572,8 @@ def matmul_4bit( bias=None, ): assert quant_state is not None - if A.numel() == A.shape[-1] and A.requires_grad == False: + if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False: + # CPU backend does not require A to be a vector if A.shape[-1] % quant_state.blocksize != 0: warn( f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}", diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py index d6a9192e4..a5e123e62 100644 --- a/bitsandbytes/backends/cpu.py +++ b/bitsandbytes/backends/cpu.py @@ -9,6 +9,9 @@ double_quant_impl, igemmlt_impl, mm_dequant_impl, + quantize_4bit_impl, + dequantize_4bit_impl, + gemm_4bit_impl, ) Tensor = torch.Tensor @@ -132,7 +135,8 @@ def quantize_4bit( quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, ) -> Tuple[torch.Tensor, QuantState]: - raise NotImplementedError("Not yet implemented for CPU backend") + assert_on_cpu([A, absmax, out]) + return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) def dequantize_4bit( self, @@ -143,7 +147,8 @@ def dequantize_4bit( blocksize: int = 64, quant_type: Literal["fp4", "nf4"] = "fp4", ) -> torch.Tensor: - raise NotImplementedError("Not yet implemented for CPU backend") + assert_on_cpu([A, absmax, out]) + return dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type) def gemv_4bit( self, @@ -154,7 +159,11 @@ def gemv_4bit( transposed_B=False, state: QuantState = None, ) -> torch.Tensor: - raise NotImplementedError("Not yet implemented for CPU backend") + assert_on_cpu([A, B, out]) + if state is None: + raise ValueError("state cannot be None. 
gemv_4bit() requires the state from quantize_4bit()") + + return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state) def dequantize_blockwise( self, diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index f4e5ed3ec..078b81680 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -1,6 +1,12 @@ import warnings - import torch +from typing import Optional +from bitsandbytes.functional import ( + get_4bit_type, + quantize_blockwise, + dequantize_blockwise, + QuantState, +) try: # to support Intel CPU/GPU (XPU) backend @@ -228,3 +234,261 @@ def mm_dequant_impl( out = out + bias.to(compute_dtype) out = out.to(output_dtype) return out + + +NF4_QUANT_TABLE = [ + -1.0 - 1e-2, # 0b0000 + -0.8480964004993439, # 0b0001 + -0.6106329262256622, # 0b0010 + -0.4599952697753906, # 0b0011 + -0.33967943489551544, # 0b0100 + -0.23460740596055984, # 0b0101 + -0.13791173323988914, # 0b0110 + -0.045525018125772476, # 0b0111 + 0.03979014977812767, # 0b1000 + 0.1202552504837513, # 0b1001 + 0.2035212516784668, # 0b1010 + 0.2920137718319893, # 0b1011 + 0.3893125355243683, # 0b1100 + 0.5016634166240692, # 0b1101 + 0.6427869200706482, # 0b1110 + 0.8614784181118011, # 0b1111 +] + + +# It's faster not to use torch.compile +def quantize_4bit_impl( + A: Tensor, + absmax: Tensor = None, + out: Tensor = None, + blocksize=64, + compress_statistics=False, + quant_type="nf4", +) -> Tensor: + """ + Quantize tensor A in blocks of 4-bit values. + + Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. + + Parameters + ---------- + A : torch.Tensor + The input tensor. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + The output tensor (8-bit). + blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now + + Returns + ------- + torch.Tensor: + The 8-bit tensor with packed 4-bit values. + tuple(torch.Tensor, torch.Size, torch.dtype, int): + The quantization state to undo the quantization. + """ + if quant_type != "nf4": + raise NotImplementedError( + f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU." 
+ ) + n = A.numel() + input_shape = A.shape + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + + if absmax is None: + absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) + + if out is None: + out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) + + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] + rem = n % blocksize + has_rem = rem > 0 + + # Scale tensor to [-1, 1] + A_reshaped = A.reshape(n) + A_com = A_reshaped[:n - rem] + A_com_reshaped = A_com.reshape(n // blocksize, blocksize) + absmax[:blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0] + scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[:blocks - has_rem].view(-1, 1)), -1, 1) + scaled_A = scaled_A.reshape(-1) + if has_rem: + absmax[-1] = torch.abs(A_reshaped[n - rem:]).max() + scaled_A_rem = torch.clamp(A_reshaped[n - rem:] * (1 / absmax[-1]), -1, 1) + scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0) + # map [-1, 1] to nf4 + out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8) + for i in range(len(NF4_QUANT_TABLE)): + out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i + if out_uint8.size(-1) % 2: + out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) + out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + code = get_4bit_type(quant_type, device=A.device) + + if compress_statistics: + raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + else: + state = QuantState( + absmax=absmax, + shape=input_shape, + dtype=A.dtype, + blocksize=blocksize, + code=code, + quant_type=quant_type, + ) + + if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0: + state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack( + out.reshape([input_shape[0], input_shape[1] // 2]), + ipex_cpu.quantization.WoqWeightDtype.NF4, + input_shape, # weight shape + absmax.view(input_shape[0], input_shape[1] // blocksize), # scales + None, # zero_points + None, # bias + None, # g_idx + None, # batch_size + blocksize, + int(ipex_cpu.quantization.WoqLowpMode.BF16), + -1, # act_quant_mode + ) + + return out, state + + +@_maybe_torch_compile +def dequantize_4bit_impl( + A: Tensor, + quant_state = None, + absmax: Tensor = None, + out: Tensor = None, + blocksize: int = 64, + quant_type="nf4", +) -> Tensor: + """ + Dequantizes FP4 blockwise quantized values. + + Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize. + + Parameters + ---------- + A : torch.Tensor + The input 8-bit tensor (packed 4-bit values). + quant_state : QuantState + object with quantisation stats, incl. absmax values, original tensor shape and original dtype. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + Dequantized output tensor. + blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now + + + Returns + ------- + torch.Tensor: + Dequantized tensor. + """ + + if quant_state is None: + assert absmax is not None and out is not None + + quant_state = QuantState( + absmax=absmax, + shape=out.shape, + dtype=out.dtype, + blocksize=blocksize, + quant_type=quant_type, + ) + + else: + absmax = quant_state.absmax + + if quant_state.quant_type != "nf4": + raise NotImplementedError( + f"4-bit quantization data type {quant_state.quant_type} is not implemented for CPU/XPU." 
+ ) + + if quant_state.nested: + raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + + if out is None: + out = torch.empty( + quant_state.shape, dtype=quant_state.dtype, device=A.device + ) + + n = out.numel() + # Map nf4 to [-1, 1] + out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device) + out_uint8[::2] = A.bitwise_and(0xF) + out_uint8[1::2] = A.bitwise_right_shift(4) + out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype) + for i in range(len(quant_state.code)): + out_dq[out_uint8 == i] = quant_state.code[i] + + # Apply scales + if out_dq.numel() != n: + assert out_dq.numel() == n + 1 + out_dq = torch.narrow(out_dq, 0, 0, n) + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + rem = n % blocksize + has_rem = rem > 0 + out_reshaped = out.reshape(-1) + out_reshaped[:n - rem] = (out_dq[:n - rem].view(-1, blocksize) * absmax[:blocks - has_rem].view(-1, 1)).reshape(-1) + if has_rem: + out_reshaped[n - rem:] = out_dq[n - rem:] * absmax[-1] + + # take transpose here because weight is transposed (again) for computation + return out.t() + + +# Do not need torch.compile here as we are calling torch/ipex kernel +def gemm_4bit_impl( + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, +) -> torch.Tensor: + """ + Matrix-matrix multiplication with 4-bit quantization. + + Parameters + ---------- + A : torch.Tensor + The first input tensor. Usually the activation tensor. + B : torch.Tensor + The second input tensor. Usually the weight tensor. + out : torch.Tensor + The output tensor. + transposed_A : bool + Whether A is transposed + transposed_B : bool + Whether B is transposed + state : QuantState + Contains quantization info, such as blocksize and dtype + + Returns + ------- + torch.Tensor: + GEMM output tensor. + """ + if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and hasattr(state, "op_context"): + assert state.op_context is not None + output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle()) + else: + dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize) + output = torch.matmul(A, dqB) + if out is not None: + out.copy_(output) + else: + out = output + return out diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 7e9ab8d05..d52cb4847 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -285,7 +285,7 @@ def from_prequantized( return self def _quantize(self, device): - w = self.data.contiguous().cuda(device) + w = self.data.contiguous().to(device) w_4bit, quant_state = bnb.functional.quantize_4bit( w, blocksize=self.blocksize, @@ -303,6 +303,9 @@ def _quantize(self, device): def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False): return self.to(device="cuda" if device is None else device, non_blocking=non_blocking) + def cpu(self, non_blocking: bool = False): + return self.to(device="cpu", non_blocking=non_blocking) + @overload def to( self: T, @@ -320,7 +323,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ... 
def to(self, *args, **kwargs): device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) - if device is not None and device.type == "cuda" and not self.bnb_quantized: + if device is not None and device.type in ["cuda", "cpu"] and not self.bnb_quantized: return self._quantize(device) else: if self.quant_state is not None: From 177bd398b3235f586e9e2110b6ffe8288eef4f00 Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Fri, 10 May 2024 00:22:04 -0700 Subject: [PATCH 06/76] Minor improvements --- bitsandbytes/backends/cpu.py | 1 + bitsandbytes/backends/cpu_xpu_common.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py index a5e123e62..80b6c241e 100644 --- a/bitsandbytes/backends/cpu.py +++ b/bitsandbytes/backends/cpu.py @@ -136,6 +136,7 @@ def quantize_4bit( quant_storage=torch.uint8, ) -> Tuple[torch.Tensor, QuantState]: assert_on_cpu([A, absmax, out]) + assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) def dequantize_4bit( diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 078b81680..ab881c6dd 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -343,6 +343,8 @@ def quantize_4bit_impl( ) if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0: + # lowp_mode: lowest precision for computation + lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16 state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack( out.reshape([input_shape[0], input_shape[1] // 2]), ipex_cpu.quantization.WoqWeightDtype.NF4, @@ -353,8 +355,8 @@ def quantize_4bit_impl( None, # g_idx None, # batch_size blocksize, - int(ipex_cpu.quantization.WoqLowpMode.BF16), - -1, # act_quant_mode + int(lowp_mode), + -1, # act_quant_mode. 
-1 means don't quant activation ) return out, state From 881b5fcd0bc77f747850f397a0bf02c288332c17 Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Fri, 10 May 2024 22:34:32 -0700 Subject: [PATCH 07/76] Add fp4 support; add UT; fix lint issues --- bitsandbytes/backends/cpu.py | 4 +- bitsandbytes/backends/cpu_xpu_common.py | 109 ++++++++++++++---------- tests/test_functional.py | 50 ++++++++++- 3 files changed, 114 insertions(+), 49 deletions(-) diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py index 80b6c241e..2c3688251 100644 --- a/bitsandbytes/backends/cpu.py +++ b/bitsandbytes/backends/cpu.py @@ -6,12 +6,12 @@ from .base import Backend from .cpu_xpu_common import ( + dequantize_4bit_impl, double_quant_impl, + gemm_4bit_impl, igemmlt_impl, mm_dequant_impl, quantize_4bit_impl, - dequantize_4bit_impl, - gemm_4bit_impl, ) Tensor = torch.Tensor diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index ab881c6dd..8d87f7e2f 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -1,11 +1,11 @@ +from typing import Optional import warnings + import torch -from typing import Optional + from bitsandbytes.functional import ( - get_4bit_type, - quantize_blockwise, - dequantize_blockwise, QuantState, + get_4bit_type, ) try: @@ -237,25 +237,37 @@ def mm_dequant_impl( NF4_QUANT_TABLE = [ - -1.0 - 1e-2, # 0b0000 - -0.8480964004993439, # 0b0001 - -0.6106329262256622, # 0b0010 - -0.4599952697753906, # 0b0011 + -1.0 - 1e-2, # 0b0000 + -0.8480964004993439, # 0b0001 + -0.6106329262256622, # 0b0010 + -0.4599952697753906, # 0b0011 -0.33967943489551544, # 0b0100 -0.23460740596055984, # 0b0101 -0.13791173323988914, # 0b0110 - -0.045525018125772476, # 0b0111 - 0.03979014977812767, # 0b1000 - 0.1202552504837513, # 0b1001 - 0.2035212516784668, # 0b1010 - 0.2920137718319893, # 0b1011 - 0.3893125355243683, # 0b1100 - 0.5016634166240692, # 0b1101 - 0.6427869200706482, # 0b1110 - 0.8614784181118011, # 0b1111 + -0.045525018125772476, # 0b0111 + 0.03979014977812767, # 0b1000 + 0.1202552504837513, # 0b1001 + 0.2035212516784668, # 0b1010 + 0.2920137718319893, # 0b1011 + 0.3893125355243683, # 0b1100 + 0.5016634166240692, # 0b1101 + 0.6427869200706482, # 0b1110 + 0.8614784181118011, # 0b1111 ] +FP4_QUANT_TABLE = { + 0 - 1e-2: 0, # 0b0000 + 0.00260417: 1, # 0b0001 + 0.0859375: 6, # 0b0110 + 0.20833333: 7, # 0b0111 + 0.29166667: 4, # 0b0100 + 0.4166667: 5, # 0b0101 + 0.583333: 2, # 0b0010 + 0.8333333: 3, # 0b0011 +} + + # It's faster not to use torch.compile def quantize_4bit_impl( A: Tensor, @@ -290,10 +302,11 @@ def quantize_4bit_impl( tuple(torch.Tensor, torch.Size, torch.dtype, int): The quantization state to undo the quantization. """ - if quant_type != "nf4": - raise NotImplementedError( - f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU." - ) + if quant_type not in ["nf4", "fp4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.") + if quant_type == "fp4": + warnings.warn("fp4 quantization is currently slow on CPU/XPU. 
Please Use nf4 instead for better performance.") + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] n = A.numel() input_shape = A.shape blocks = n // blocksize @@ -305,25 +318,31 @@ def quantize_4bit_impl( if out is None: out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) - assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] rem = n % blocksize has_rem = rem > 0 # Scale tensor to [-1, 1] A_reshaped = A.reshape(n) - A_com = A_reshaped[:n - rem] + A_com = A_reshaped[: n - rem] A_com_reshaped = A_com.reshape(n // blocksize, blocksize) - absmax[:blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0] - scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[:blocks - has_rem].view(-1, 1)), -1, 1) + absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0] + scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1) scaled_A = scaled_A.reshape(-1) if has_rem: - absmax[-1] = torch.abs(A_reshaped[n - rem:]).max() - scaled_A_rem = torch.clamp(A_reshaped[n - rem:] * (1 / absmax[-1]), -1, 1) + absmax[-1] = torch.abs(A_reshaped[n - rem :]).max() + scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1) scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0) - # map [-1, 1] to nf4 + # map [-1, 1] to nf4/fp4 out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8) - for i in range(len(NF4_QUANT_TABLE)): - out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i + if quant_type == "nf4": + for i in range(len(NF4_QUANT_TABLE)): + out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i + elif quant_type == "fp4": + sign = scaled_A < 0 + abs_scaled_A = torch.abs(scaled_A) + for key, val in FP4_QUANT_TABLE.items(): + out_uint8[abs_scaled_A > key] = val + out_uint8 += sign.to(torch.uint8) * 8 if out_uint8.size(-1) % 2: out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) @@ -342,21 +361,21 @@ def quantize_4bit_impl( quant_type=quant_type, ) - if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and input_shape[0] % blocksize == 0: + if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and input_shape[1] % blocksize == 0 and quant_type == "nf4": # lowp_mode: lowest precision for computation lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16 state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack( out.reshape([input_shape[0], input_shape[1] // 2]), ipex_cpu.quantization.WoqWeightDtype.NF4, - input_shape, # weight shape - absmax.view(input_shape[0], input_shape[1] // blocksize), # scales - None, # zero_points - None, # bias - None, # g_idx - None, # batch_size + input_shape, # weight shape + absmax.view(input_shape[0], input_shape[1] // blocksize), # scales + None, # zero_points + None, # bias + None, # g_idx + None, # batch_size blocksize, int(lowp_mode), - -1, # act_quant_mode. -1 means don't quant activation + -1, # act_quant_mode. -1 means don't quant activation ) return out, state @@ -365,7 +384,7 @@ def quantize_4bit_impl( @_maybe_torch_compile def dequantize_4bit_impl( A: Tensor, - quant_state = None, + quant_state=None, absmax: Tensor = None, out: Tensor = None, blocksize: int = 64, @@ -412,7 +431,7 @@ def dequantize_4bit_impl( else: absmax = quant_state.absmax - if quant_state.quant_type != "nf4": + if quant_type not in ["nf4", "fp4"]: raise NotImplementedError( f"4-bit quantization data type {quant_state.quant_type} is not implemented for CPU/XPU." 
) @@ -421,9 +440,7 @@ def dequantize_4bit_impl( raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") if out is None: - out = torch.empty( - quant_state.shape, dtype=quant_state.dtype, device=A.device - ) + out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device) n = out.numel() # Map nf4 to [-1, 1] @@ -443,9 +460,11 @@ def dequantize_4bit_impl( rem = n % blocksize has_rem = rem > 0 out_reshaped = out.reshape(-1) - out_reshaped[:n - rem] = (out_dq[:n - rem].view(-1, blocksize) * absmax[:blocks - has_rem].view(-1, 1)).reshape(-1) + out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape( + -1 + ) if has_rem: - out_reshaped[n - rem:] = out_dq[n - rem:] * absmax[-1] + out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1] # take transpose here because weight is transposed (again) for computation return out.t() diff --git a/tests/test_functional.py b/tests/test_functional.py index 8e125f712..ea15f148a 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -2003,7 +2003,8 @@ def test_bench_dequantization(): @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096]) -def test_4bit_quant(dtype, quant_type, blocksize): +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +def test_4bit_quant(dtype, quant_type, blocksize, device): vals = list(product([0, 1], repeat=4)) code = {} @@ -2027,9 +2028,11 @@ def test_4bit_quant(dtype, quant_type, blocksize): result = sign * exp * frac code[idx] = result - A1 = torch.randn(1024, 1024, device="cuda", dtype=dtype) + A1 = torch.randn(1024, 1024, device=device, dtype=dtype) qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type) A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type) + if device == "cpu": + A2 = A2.t() err = (A1 - A2).abs().float() relerr = (err / (A1.abs().float() + 1e-8)).mean() @@ -2279,6 +2282,49 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): assert maxratio < 1.02 and maxratio > 0.98 +@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) +@pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +def test_gemv_4bit_cpu(dtype, quant_type, kind): + """ + Test 4bit GEMV for CPU. It is simplified a lot from the cuda version, since + the CPU backend does not support double_quant or quant_storage other than uint8. 
+ Also, the CPU backend has different numeric accuracy from that of CUDA + """ + for dim in [128, 256, 512, 1024]: + for i in range(10): + if kind == "fc1": + A = torch.randn(1, dim, dtype=dtype, device="cpu") + B = torch.randn(dim * 4, dim, dtype=dtype, device="cpu") / math.sqrt(dim) + elif kind == "fc2": + A = torch.randn(1, 4 * dim, dtype=dtype, device="cpu") + B = torch.randn(dim, 4 * dim, dtype=dtype, device="cpu") / math.sqrt(dim) + elif kind == "attn": + A = torch.randn(1, dim, dtype=dtype, device="cpu") + B = torch.randn(dim, dim, dtype=dtype, device="cpu") / math.sqrt(dim) + elif kind == "attn_packed": + A = torch.randn(1, dim, dtype=dtype, device="cpu") + B = torch.randn(dim * 3, dim, dtype=dtype, device="cpu") / math.sqrt(dim) + + qB, state = F.quantize_4bit( + B, + quant_type=quant_type, + compress_statistics=False, + quant_storage=torch.uint8, + ) + dqB = F.dequantize_4bit(qB, state) + C3 = torch.matmul(A, dqB) + C2 = F.gemv_4bit(A, qB.t(), state=state) + A.requires_grad = True + C1 = bnb.matmul_4bit(A, qB.t(), state) + + c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 + rtol = 1e-3 if dtype != torch.bfloat16 else 1e-2 + atol = 1e-2 if dtype != torch.bfloat16 else 5e-2 + assert_all_approx_close(C1, C2, rtol, atol, count=c) + assert_all_approx_close(C3, C2, rtol, atol, count=c) + + @pytest.mark.skip("Row scale has some bugs for ampere") def test_managed(): n = 32 * 10 From dd15734709f131b4c1e3244ba28e632dbf5a3ed6 Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Fri, 10 May 2024 23:57:25 -0700 Subject: [PATCH 08/76] Reduce memory usage --- bitsandbytes/backends/cpu_xpu_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 8d87f7e2f..426d07975 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -377,6 +377,7 @@ def quantize_4bit_impl( int(lowp_mode), -1, # act_quant_mode. -1 means don't quant activation ) + return torch.Tensor(), state return out, state From 85a01b00fc131a586dec8fec5d25d753a471006c Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Sat, 11 May 2024 00:42:31 -0700 Subject: [PATCH 09/76] Fix UT --- bitsandbytes/backends/cpu_xpu_common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 426d07975..7c35a85c3 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -440,6 +440,11 @@ def dequantize_4bit_impl( if quant_state.nested: raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(quant_state, "op_context"): + assert quant_state.op_context is not None + A = quant_state.op_context.to_public(quant_state.op_context.get_weight()) + A = A.reshape(-1) + if out is None: out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device) @@ -503,7 +508,7 @@ def gemm_4bit_impl( torch.Tensor: GEMM output tensor. 
""" - if ipex_cpu and _ipex_cpu_version_prereq(2, 2) and hasattr(state, "op_context"): + if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"): assert state.op_context is not None output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle()) else: From 2c489f8dde8e5992af5aa0956e1a4cb9554b72eb Mon Sep 17 00:00:00 2001 From: "Xia, Weiwen" Date: Sat, 11 May 2024 00:54:17 -0700 Subject: [PATCH 10/76] reduce memory usage for nf4 --- bitsandbytes/backends/cpu_xpu_common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 7c35a85c3..138ec72f5 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -377,6 +377,7 @@ def quantize_4bit_impl( int(lowp_mode), -1, # act_quant_mode. -1 means don't quant activation ) + state.absmax = torch.Tensor() return torch.Tensor(), state return out, state @@ -444,6 +445,7 @@ def dequantize_4bit_impl( assert quant_state.op_context is not None A = quant_state.op_context.to_public(quant_state.op_context.get_weight()) A = A.reshape(-1) + absmax = quant_state.op_context.get_scales().reshape(-1) if out is None: out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device) From 13c70d305803ca9804958dc097919e02c41102ee Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Mon, 29 Apr 2024 11:43:08 -0700 Subject: [PATCH 11/76] clarify --- docs/source/fsdp_qlora.md | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/source/fsdp_qlora.md b/docs/source/fsdp_qlora.md index 47922cfcc..648e79092 100644 --- a/docs/source/fsdp_qlora.md +++ b/docs/source/fsdp_qlora.md @@ -9,21 +9,26 @@ This guide provides a brief guide on how bitsandbytes supports storing quantized ## Quantized data storage -FSDP only supports sharding float data types which can be problematic because quantized weights are typically stored as integer data types (uint8). bitsandbytes doesn't have this problem because it uses `StoreChar` to read and write quantized weights regardless of the data type storage. This makes it simple to add a `quant_storage` parameter to the [`~nn.Linear4bit`] and [`~nn.Params4bit`] classes and set it to `torch.uint8` to maintain backward compatibility with the codebase. +FSDP only supports sharding float data types which can be problematic because quantized weights are typically stored as integer data types (uint8). bitsandbytes doesn't have this problem because it uses `StoreChar` to read and write quantized weights regardless of the data type storage. This makes it simple to add a `quant_storage` parameter to the [`~nn.Linear4bit`] and [`~nn.Params4bit`] classes and set it to `torch.uint8` to maintain backward compatibility with the codebase. With the `quant_storage` parameter, you can select any of the FSDP supported data types to shard [`~nn.Linear4bit`] with such as bfloat16, float16 or float32. + +For example, you can configure this option in [`transformers.BitsAndBytesConfig`] by setting the `bnb_4bit_quant_storage` parameter. 
```py -import torch -import bitsandbytes as bnb - -model = bnb.nn.Linear4bit( - input_features, - output_features, - quant_type="fp4", - quant_storage=torch.uint8, +from transformers import BitsAndBytesConfig, AutoModelForCausalLM + +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_storage=torch.bfloat16, ) -``` -With the `quant_storage` parameter, you can select any of the FSDP supported data types to shard [`~nn.Linear4bit`] with such as bfloat16, float16 or float32. +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-70b", + quantization_config=bnb_config, + torch_dtype=torch.bfloat16, +) +``` ## Training From 2b7daed3db413824f47a0fdfcc0e5353669eebd0 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 14 May 2024 10:55:35 -0700 Subject: [PATCH 12/76] clarify --- docs/source/fsdp_qlora.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/source/fsdp_qlora.md b/docs/source/fsdp_qlora.md index 648e79092..c623fa424 100644 --- a/docs/source/fsdp_qlora.md +++ b/docs/source/fsdp_qlora.md @@ -11,7 +11,10 @@ This guide provides a brief guide on how bitsandbytes supports storing quantized FSDP only supports sharding float data types which can be problematic because quantized weights are typically stored as integer data types (uint8). bitsandbytes doesn't have this problem because it uses `StoreChar` to read and write quantized weights regardless of the data type storage. This makes it simple to add a `quant_storage` parameter to the [`~nn.Linear4bit`] and [`~nn.Params4bit`] classes and set it to `torch.uint8` to maintain backward compatibility with the codebase. With the `quant_storage` parameter, you can select any of the FSDP supported data types to shard [`~nn.Linear4bit`] with such as bfloat16, float16 or float32. -For example, you can configure this option in [`transformers.BitsAndBytesConfig`] by setting the `bnb_4bit_quant_storage` parameter. +You'll typically access and configure this option from [`transformers.BitsAndBytesConfig`] by setting the `bnb_4bit_quant_storage` parameter. It is very **important** the `quant_storage` data type matches the data types used throughout the model because FSDP can only wrap layers and modules that have the *same floating data type*. Making sure the data types are aligned will ensure the model is correctly sharded. + +> [!TIP] +> The `compute_dtype` is the data type used for computation inside the CUDA kernel, where the 4-bit quantized weights are unpacked from the data type in `quant_storage` and dequantized to `compute_dtype`. We recommend using torch.bfloat16 (if available on your hardware) for better numerical stability. ```py from transformers import BitsAndBytesConfig, AutoModelForCausalLM @@ -30,9 +33,11 @@ model = AutoModelForCausalLM.from_pretrained( ) ``` +Check out this [section](https://hf.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) of the PEFT documentation for the config file and training code to run FSDP-QLoRA training. + ## Training -bitsandbytes is deeply integrated with the Hugging Face ecosystem, making it easy to use with libraries like [Transformers](https://hf/co/docs/transformers), [PEFT](https://hf/co/docs/peft), and [TRL](https://hf/co/docs/trl). 
+bitsandbytes is deeply integrated with the Hugging Face ecosystem, making it easy to use with libraries like [Transformers](https://hf.co/docs/transformers), [PEFT](https://hf.co/docs/peft), and [TRL](https://hf.co/docs/trl). Before you begin, make sure you have the latest libraries installed. From d7a5a244fb3b277284788a9b54deebf2ab8d2a75 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 16 May 2024 12:52:11 -0700 Subject: [PATCH 13/76] feedback --- docs/source/fsdp_qlora.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/fsdp_qlora.md b/docs/source/fsdp_qlora.md index c623fa424..11e169ffb 100644 --- a/docs/source/fsdp_qlora.md +++ b/docs/source/fsdp_qlora.md @@ -37,17 +37,19 @@ Check out this [section](https://hf.co/docs/peft/main/en/accelerate/fsdp#use-pef ## Training +> [!TIP] +> FSDP is a distributed training framework that needs to be launched as a distributed training job with a library like [Accelerate](https://hf.co/docs/accelerate/index) or [torchrun](https://pytorch.org/docs/stable/elastic/run.html). The launch command provided in this section uses Accelerate to launch the training script. + bitsandbytes is deeply integrated with the Hugging Face ecosystem, making it easy to use with libraries like [Transformers](https://hf.co/docs/transformers), [PEFT](https://hf.co/docs/peft), and [TRL](https://hf.co/docs/trl). +PEFT provides a configuration file ([fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml)), launch command ([run_peft_qlora_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_fsdp.sh)), and training script ([train.py](https://github.com/huggingface/peft/blob/main/examples/sft/train.py)) for running FSDP-QLoRA. To learn more, check out the [Use PEFT QLoRA and FSDP for finetuning large models on multiple GPUs](https://huggingface.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) documentation. This section briefly covers the steps to run FSDP-QLoRA training. + Before you begin, make sure you have the latest libraries installed. ```bash pip install -U bitsandbytes accelerate transformers peft trl ``` -> [!TIP] -> PEFT provides a configuration file ([fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml)), launch command ([run_peft_qlora_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_fsdp.sh)), and training script ([train.py](https://github.com/huggingface/peft/blob/main/examples/sft/train.py)) for FSDP-QLoRA. To learn more, check out the [Use PEFT QLoRA and FSDP for finetuning large models on multiple GPUs](https://huggingface.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) documentation. - The important change that enables FSDP-QLoRA training is the `bnb_4bit_quant_storage` parameter in the [`~transformers.BitsAndBytesConfig`] class. This allows you to set the storage data type of the quantized weights to a float data type. ```py From c51437bc3ec9490dfdd2b315461973b8eb366c7a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 May 2024 14:30:57 +0000 Subject: [PATCH 14/76] Update matplotlib requirement from ~=3.8.4 to ~=3.9.0 in the major group Updates the requirements on [matplotlib](https://github.com/matplotlib/matplotlib) to permit the latest version. 
Updates `matplotlib` to 3.9.0 - [Release notes](https://github.com/matplotlib/matplotlib/releases) - [Commits](https://github.com/matplotlib/matplotlib/compare/v3.8.4...v3.9.0) --- updated-dependencies: - dependency-name: matplotlib dependency-type: direct:development dependency-group: major ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 0334896be..16e80cccb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,4 +6,4 @@ wheel~=0.43.0 lion-pytorch~=0.1.4 scipy~=1.13.0 pandas~=2.2.2 -matplotlib~=3.8.4 +matplotlib~=3.9.0 From fa65a9dcc0514819388545e5295a10b67ad950c1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 May 2024 14:31:36 +0000 Subject: [PATCH 15/76] Bump pytest from 8.2.0 to 8.2.1 in the minor-patch group Bumps the minor-patch group with 1 update: [pytest](https://github.com/pytest-dev/pytest). Updates `pytest` from 8.2.0 to 8.2.1 - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.0...8.2.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-patch dependency-group: minor-patch ... Signed-off-by: dependabot[bot] --- requirements-ci.txt | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 24e2db324..c3faf7522 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,5 +1,5 @@ # Requirements used for GitHub actions -pytest==8.2.0 +pytest==8.2.1 einops==0.8.0 lion-pytorch==0.1.4 scipy==1.10.1; python_version < "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index 0334896be..56f7da762 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ # Requirements used for local development setuptools>=63 -pytest~=8.2.0 +pytest~=8.2.1 einops~=0.8.0 wheel~=0.43.0 lion-pytorch~=0.1.4 From 79815ad4f0a2e5df01515b9890aee7f0d93ee688 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 24 May 2024 16:50:03 +0200 Subject: [PATCH 16/76] README: ask for help from volunteer alpha testers --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 3f40b1e0e..7823168ac 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,16 @@ There are ongoing efforts to support further hardware backends, i.e. Intel CPU + **[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)** +## ALPHA TESTERS WANTED: `multi-backend-refactor` AMD GPU + Intel CPU/GPU specific BNB backend implementations + +We're in the process of a complex refactor in order to allow the support of additional hardware backends, other than CUDA, in BNB. The efforts around this are already quite far along and there's plenty of functionality already in place that is in need for users to take a hands-on approach! Mac support will likely soon also see progress. However, I recommend waiting 2 weeks until the device abstraction has further consolidated (**breaking changes upcoming**). 
+ +Currently, you still need to compile from source, after checking out the `multi-backend-refactor` branch (instructions WIP, but [the current docs on the compilation from source](https://huggingface.co/docs/bitsandbytes/main/en/installation#compile-from-source) are a good starting point; [feel free to share tips / input in this Github discussion](https://github.com/TimDettmers/bitsandbytes/discussions/1219). We'll soon enable nightly releases to make this much easier for you! + +Please give feedback to us in [this dedicated Github Discussion space](https://github.com/TimDettmers/bitsandbytes/discussions/categories/catch-all-alpha-testing-the-multi-backend-refactor)! + +We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality. BNB is a community project and we're excited for your collaboration 🤗 + ## License `bitsandbytes` is MIT licensed. From a9a1c44abaf671f06794f6664fe3ca47bcade5e5 Mon Sep 17 00:00:00 2001 From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com> Date: Sun, 26 May 2024 15:43:24 +0200 Subject: [PATCH 17/76] Add `"lamb"` to `str2optimizer32bit` --- bitsandbytes/functional.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index f915223ca..dc1490482 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -49,6 +49,10 @@ def prod(iterable): lib.cadagrad32bit_grad_32, lib.cadagrad32bit_grad_16, ), + "lamb": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + ), } str2optimizer8bit = { From ccee5d894b8aed4c24976a5622140a930d1f5574 Mon Sep 17 00:00:00 2001 From: statelesshz Date: Mon, 27 May 2024 14:22:54 +0800 Subject: [PATCH 18/76] Add empty stubs for Ascend NPU --- bitsandbytes/__init__.py | 6 +- bitsandbytes/backends/npu.py | 170 +++++++++++++++++++++++++++++++++++ 2 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 bitsandbytes/backends/npu.py diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 760a8eda4..eff7fc686 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -16,6 +16,7 @@ ) from .backends import register_backend from .backends.cpu import CPUBackend +from .backends.npu import NPUBackend from .cextension import lib from .nn import modules @@ -49,11 +50,14 @@ register_backend("xpu", XPUBackend()) +# Register Ascend NPU backend, if available. +if hasattr(torch, "npu") and torch.npu.is_available(): + register_backend("npu", NPUBackend()) + # TODO: Other potential backends: # XLA - Google TPU / PJRT runtime # HPU - Habana / Intel Gaudi # IPU - Graphcore -# NPU - Ascend # Note that we may not map 1:1 with a device type, e.g. 
SYCL, XLA # In this case, it will be up to each backend to dispatch as needed diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py new file mode 100644 index 000000000..1b3cb57d6 --- /dev/null +++ b/bitsandbytes/backends/npu.py @@ -0,0 +1,170 @@ +from typing import Literal, Optional, Tuple, Union + +import torch + +from bitsandbytes.utils import QuantState + +from .base import Backend + +try: + # to support Ascend NPU backend + import torch_npu # noqa: F401 +except ImportError: + pass + + +class NPUBackend(Backend): + def double_quant( + self, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, + threshold=0.0, + ): + raise NotImplementedError + + def transform( + self, + A: torch.Tensor, + to_order: str, + from_order="row", + out: Optional[torch.Tensor] = None, + transpose=False, + state: Optional[Tuple[torch.Size, str]] = None, + ld=None, + ): + raise NotImplementedError + + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]: + raise NotImplementedError + + def mm_dequant( + self, + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + raise NotImplementedError + + def extract_outliers( + self, + A: torch.Tensor, + SA: Tuple[torch.Size, str], + idx: torch.Tensor, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_4bit( + self, + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def dequantize_4bit( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, + quant_type: Literal["fp4", "nf4"] = "fp4", + ) -> torch.Tensor: + raise NotImplementedError + + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ) -> torch.Tensor: + raise NotImplementedError + + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_blockwise( + self, + A: torch.Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + 
lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError + + def optimizer_update_32bit( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError From a8644b7b86cc54055cc6138ce3de6229552d32b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 May 2024 14:38:07 +0000 Subject: [PATCH 19/76] Bump scipy from 1.13.0 to 1.13.1 in the minor-patch group Bumps the minor-patch group with 1 update: [scipy](https://github.com/scipy/scipy). Updates `scipy` from 1.13.0 to 1.13.1 - [Release notes](https://github.com/scipy/scipy/releases) - [Commits](https://github.com/scipy/scipy/compare/v1.13.0...v1.13.1) --- updated-dependencies: - dependency-name: scipy dependency-type: direct:production update-type: version-update:semver-patch dependency-group: minor-patch ... Signed-off-by: dependabot[bot] --- requirements-ci.txt | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index c3faf7522..0e9dd2407 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -3,4 +3,4 @@ pytest==8.2.1 einops==0.8.0 lion-pytorch==0.1.4 scipy==1.10.1; python_version < "3.9" -scipy==1.13.0; python_version >= "3.9" +scipy==1.13.1; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index 75badf031..de7adce94 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,6 @@ pytest~=8.2.1 einops~=0.8.0 wheel~=0.43.0 lion-pytorch~=0.1.4 -scipy~=1.13.0 +scipy~=1.13.1 pandas~=2.2.2 matplotlib~=3.9.0 From 2e46eefcb214cffc0fb9d6ace71f53924f9c7873 Mon Sep 17 00:00:00 2001 From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com> Date: Tue, 28 May 2024 18:35:31 +0200 Subject: [PATCH 20/76] Sorted alphabetically for better overview --- bitsandbytes/functional.py | 64 +++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index dc1490482..0b1e7d5c4 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -27,11 +27,24 @@ def prod(iterable): if lib and lib.compiled_with_cuda: """C FUNCTIONS FOR OPTIMIZERS""" str2optimizer32bit = { + "adagrad": ( + lib.cadagrad32bit_grad_32, + lib.cadagrad32bit_grad_16, + ), "adam": ( lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, lib.cadam32bit_grad_bf16, ), + "lamb": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + ), + "lion": ( + lib.clion32bit_grad_fp32, + lib.clion32bit_grad_fp16, + lib.clion32bit_grad_bf16, + ), "momentum": ( lib.cmomentum32bit_grad_32, lib.cmomentum32bit_grad_16, @@ -40,19 +53,6 @@ def prod(iterable): lib.crmsprop32bit_grad_32, lib.crmsprop32bit_grad_16, ), - "lion": ( - lib.clion32bit_grad_fp32, - lib.clion32bit_grad_fp16, - lib.clion32bit_grad_bf16, - ), - "adagrad": ( - lib.cadagrad32bit_grad_32, - lib.cadagrad32bit_grad_16, - ), - "lamb": ( - lib.cadam32bit_grad_fp32, - lib.cadam32bit_grad_fp16, - ), } 
str2optimizer8bit = { @@ -60,34 +60,43 @@ def prod(iterable): lib.cadam_static_8bit_grad_32, lib.cadam_static_8bit_grad_16, ), - "momentum": ( + "lamb": ( + lib.cadam_static_8bit_grad_32, + lib.cadam_static_8bit_grad_16, + ), + "lars": ( lib.cmomentum_static_8bit_grad_32, lib.cmomentum_static_8bit_grad_16, ), - "rmsprop": ( - lib.crmsprop_static_8bit_grad_32, - lib.crmsprop_static_8bit_grad_16, - ), "lion": ( lib.clion_static_8bit_grad_32, lib.clion_static_8bit_grad_16, ), - "lamb": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ), - "lars": ( + "momentum": ( lib.cmomentum_static_8bit_grad_32, lib.cmomentum_static_8bit_grad_16, ), + "rmsprop": ( + lib.crmsprop_static_8bit_grad_32, + lib.crmsprop_static_8bit_grad_16, + ), } str2optimizer8bit_blockwise = { + "adagrad": ( + lib.cadagrad_8bit_blockwise_grad_fp32, + lib.cadagrad_8bit_blockwise_grad_fp16, + ), "adam": ( lib.cadam_8bit_blockwise_grad_fp32, lib.cadam_8bit_blockwise_grad_fp16, lib.cadam_8bit_blockwise_grad_bf16, ), + "lion": ( + lib.clion_8bit_blockwise_grad_fp32, + lib.clion_8bit_blockwise_grad_fp16, + lib.clion_8bit_blockwise_grad_bf16, + ), "momentum": ( lib.cmomentum_8bit_blockwise_grad_fp32, lib.cmomentum_8bit_blockwise_grad_fp16, @@ -96,15 +105,6 @@ def prod(iterable): lib.crmsprop_8bit_blockwise_grad_fp32, lib.crmsprop_8bit_blockwise_grad_fp16, ), - "lion": ( - lib.clion_8bit_blockwise_grad_fp32, - lib.clion_8bit_blockwise_grad_fp16, - lib.clion_8bit_blockwise_grad_bf16, - ), - "adagrad": ( - lib.cadagrad_8bit_blockwise_grad_fp32, - lib.cadagrad_8bit_blockwise_grad_fp16, - ), } From 7a338db2eccbd60b7da3b7bed9c927117c6b3806 Mon Sep 17 00:00:00 2001 From: EtienneDosSantos <130935112+EtienneDosSantos@users.noreply.github.com> Date: Tue, 28 May 2024 19:53:57 +0200 Subject: [PATCH 21/76] Update functional.py --- bitsandbytes/functional.py | 88 ++++++++++++++++++++++++++++++-------- 1 file changed, 70 insertions(+), 18 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 0b1e7d5c4..bbfbf0007 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -28,57 +28,94 @@ def prod(iterable): """C FUNCTIONS FOR OPTIMIZERS""" str2optimizer32bit = { "adagrad": ( - lib.cadagrad32bit_grad_32, - lib.cadagrad32bit_grad_16, + lib.cadagrad32bit_grad_fp32, + lib.cadagrad32bit_grad_fp16, ), "adam": ( lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, lib.cadam32bit_grad_bf16, ), + "pagedadam": ( + lib.cpagedadam32bit_grad_fp32, + lib.cpagedadam32bit_grad_fp16, + lib.cpagedadam32bit_grad_bf16, + ), + "adamw": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + lib.cadam32bit_grad_bf16, + ), + "pagedadamw": ( + lib.cpagedadam32bit_grad_fp32, + lib.cpagedadam32bit_grad_fp16, + lib.cpagedadam32bit_grad_bf16, + ), "lamb": ( lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, ), + "lars": ( + lib.clars32bit_grad_fp32, + lib.clars32bit_grad_fp16, + ), "lion": ( lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16, lib.clion32bit_grad_bf16, ), "momentum": ( - lib.cmomentum32bit_grad_32, - lib.cmomentum32bit_grad_16, + lib.cmomentum32bit_grad_fp32, + lib.cmomentum32bit_grad_fp16, ), "rmsprop": ( - lib.crmsprop32bit_grad_32, - lib.crmsprop32bit_grad_16, + lib.crmsprop32bit_grad_fp32, + lib.crmsprop32bit_grad_fp16, ), } str2optimizer8bit = { + "adagrad": ( + lib.cadagrad8bit_grad_fp32, + lib.cadagrad8bit_grad_fp16, + ), "adam": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, + lib.cadam_static_8bit_grad_fp32, + lib.cadam_static_8bit_grad_fp16, + ), + 
"pagedadam": ( + lib.cpagedadam8bit_grad_fp32, + lib.cpagedadam8bit_grad_fp16, + lib.cpagedadam8bit_grad_bf16, + ), + "adamw": ( + lib.cadam_static_8bit_grad_fp32, + lib.cadam_static_8bit_grad_fp16, + ), + "pagedadamw": ( + lib.cpagedadam8bit_grad_fp32, + lib.cpagedadam8bit_grad_fp16, + lib.cpagedadam8bit_grad_bf16, ), "lamb": ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, + lib.cadam_static_8bit_grad_fp32, + lib.cadam_static_8bit_grad_fp16, ), "lars": ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, + lib.clars8bit_grad_fp32, + lib.clars8bit_grad_fp16, ), "lion": ( - lib.clion_static_8bit_grad_32, - lib.clion_static_8bit_grad_16, + lib.clion_static_8bit_grad_fp32, + lib.clion_static_8bit_grad_fp16, ), "momentum": ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, + lib.cmomentum_static_8bit_grad_fp32, + lib.cmomentum_static_8bit_grad_fp16, ), "rmsprop": ( - lib.crmsprop_static_8bit_grad_32, - lib.crmsprop_static_8bit_grad_16, + lib.crmsprop_static_8bit_grad_fp32, + lib.crmsprop_static_8bit_grad_fp16, ), } @@ -92,6 +129,21 @@ def prod(iterable): lib.cadam_8bit_blockwise_grad_fp16, lib.cadam_8bit_blockwise_grad_bf16, ), + "pagedadam": ( + lib.cpagedadam8bit_blockwise_fp32, + lib.cpagedadam8bit_blockwise_fp16, + lib.cpagedadam8bit_blockwise_bf16, + ), + "adamw": ( + lib.cadam_8bit_blockwise_grad_fp32, + lib.cadam_8bit_blockwise_grad_fp16, + lib.cadam_8bit_blockwise_grad_bf16, + ), + "pagedadamw": ( + lib.cpagedadam8bit_blockwise_fp32, + lib.cpagedadam8bit_blockwise_fp16, + lib.cpagedadam8bit_blockwise_bf16, + ), "lion": ( lib.clion_8bit_blockwise_grad_fp32, lib.clion_8bit_blockwise_grad_fp16, From 2fb212bdf983891451db73a4c4bef6c91ec0786d Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Wed, 29 May 2024 15:36:44 +0200 Subject: [PATCH 22/76] FIX Prevent __getstate__ from mutating Params4bit As discussed internally, use state = self.__dict__.copy(), which is also what the Python docs recommend. 
--- bitsandbytes/nn/modules.py | 2 +- tests/test_linear4bit.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 24a155ab1..df347ebba 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -236,7 +236,7 @@ def __new__( return self def __getstate__(self): - state = self.__dict__ + state = self.__dict__.copy() state["data"] = self.data state["requires_grad"] = self.requires_grad return state diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index bbbd05335..2f094be27 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -186,19 +186,30 @@ def test_copy_param(): def test_deepcopy_param(): tensor = torch.tensor([1.0, 2.0, 3.0, 4.0]) param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0) + dict_keys_before = set(param.__dict__.keys()) copy_param = copy.deepcopy(param) + dict_keys_after = set(param.__dict__.keys()) + dict_keys_copy = set(copy_param.__dict__.keys()) + assert param.quant_state is not copy_param.quant_state assert param.data.data_ptr() != copy_param.data.data_ptr() + # there was a bug where deepcopy would modify the original object + assert dict_keys_before == dict_keys_after + assert dict_keys_before == dict_keys_copy + def test_params4bit_real_serialization(): original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32) original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4") + dict_keys_before = set(original_param.__dict__.keys()) original_param.cuda(0) # move to CUDA to trigger quantization serialized_param = pickle.dumps(original_param) deserialized_param = pickle.loads(serialized_param) + dict_keys_after = set(original_param.__dict__.keys()) + dict_keys_deserialized = set(deserialized_param.__dict__.keys()) assert torch.equal(original_param.data, deserialized_param.data) assert original_param.requires_grad == deserialized_param.requires_grad == False @@ -206,3 +217,7 @@ def test_params4bit_real_serialization(): assert original_param.blocksize == deserialized_param.blocksize assert original_param.compress_statistics == deserialized_param.compress_statistics assert original_param.quant_state == deserialized_param.quant_state + + # there was a bug where deepcopy would modify the original object + assert dict_keys_before == dict_keys_after + assert dict_keys_before == dict_keys_deserialized From 36fe1a0cc768686cb6e3d864573eae822509d21d Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 29 May 2024 13:58:37 -0400 Subject: [PATCH 23/76] fix blocksize --- bitsandbytes/backends/cpu.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py index 2c3688251..5d38171d5 100644 --- a/bitsandbytes/backends/cpu.py +++ b/bitsandbytes/backends/cpu.py @@ -135,6 +135,8 @@ def quantize_4bit( quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, ) -> Tuple[torch.Tensor, QuantState]: + if blocksize is None: + blocksize = 64 assert_on_cpu([A, absmax, out]) assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) @@ -148,6 +150,8 @@ def dequantize_4bit( blocksize: int = 64, quant_type: Literal["fp4", "nf4"] = "fp4", ) -> torch.Tensor: + if blocksize is None: + blocksize = 64 assert_on_cpu([A, absmax, out]) return dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type) From 
ed99b3c118d73e829e7581c04a414b80ee2b7030 Mon Sep 17 00:00:00 2001 From: Benjamin Bossan Date: Thu, 30 May 2024 17:03:48 +0200 Subject: [PATCH 24/76] FIX Make Int8Params deepcopy-able This requires to implement the __deepcopy__ method in Int8Params. Moreover, there was an issue in the Linear8BitLT constructor that would assign instance attributes to the class, which is now fixed. Please review carefully that this does not impact existing code. Tests that I ran: - pytest tests/test_linear8bitlt.py - in PEFT: python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_gpu_examples.py - in PEFT: python -m pytest -m "single_gpu_tests and bitsandbytes" tests/test_common_gpu.py - in transformers: RUN_SLOW=1 python -m pytest tests/quantization/bnb -x --- bitsandbytes/nn/modules.py | 19 ++++++++++--- tests/test_linear8bitlt.py | 58 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 4 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 24a155ab1..e44c77ac6 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -560,13 +560,12 @@ def __new__( CB=None, SCB=None, ): - cls.has_fp16_weights = has_fp16_weights - cls.CB = None - cls.SCB = None if data is None: data = torch.empty(0) obj = torch.Tensor._make_subclass(cls, data, requires_grad) - obj.CB, obj.SCB = cls.CB, cls.SCB + obj.CB = CB + obj.SCB = SCB + obj.has_fp16_weights = has_fp16_weights return obj def cuda(self, device): @@ -585,6 +584,18 @@ def cuda(self, device): return self + def __deepcopy__(self, memo): + # adjust this if new arguments are added to the constructor + new_instance = type(self).__new__( + type(self), + data=copy.deepcopy(self.data, memo), + requires_grad=self.requires_grad, + has_fp16_weights=self.has_fp16_weights, + CB=copy.deepcopy(self.CB, memo), + SCB=copy.deepcopy(self.SCB, memo), + ) + return new_instance + @overload def to( self: T, diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 4b62abd6d..e55abe110 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,5 +1,7 @@ from contextlib import nullcontext +import copy import os +import pickle from tempfile import TemporaryDirectory import pytest @@ -177,3 +179,59 @@ def test_linear_serialization( assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5) assert torch.allclose(fx_first, fx_third, atol=1e-5) assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) + + +@pytest.fixture +def linear8bit(): + linear = torch.nn.Linear(32, 96) + linear_custom = Linear8bitLt( + linear.in_features, + linear.out_features, + linear.bias is not None, + has_fp16_weights=False, + threshold=6.0, + ) + linear_custom.weight = bnb.nn.Int8Params( + linear.weight.data.clone(), + requires_grad=False, + has_fp16_weights=False, + ) + linear_custom.bias = linear.bias + linear_custom = linear_custom.cuda() + return linear_custom + + +def test_linear8bit_copy_param(linear8bit): + shallow_copy = copy.copy(linear8bit) + assert linear8bit.weight is shallow_copy.weight + assert linear8bit.bias is shallow_copy.bias + assert linear8bit.weight.data.data_ptr() == shallow_copy.weight.data.data_ptr() + + +def test_linear8bit_deepcopy_param(linear8bit): + deep_copy = copy.deepcopy(linear8bit) + assert linear8bit.weight is not deep_copy.weight + assert linear8bit.bias is not deep_copy.bias + assert linear8bit.weight.data.data_ptr() != deep_copy.weight.data.data_ptr() + assert torch.allclose(linear8bit.weight.data, deep_copy.weight.data) + assert linear8bit.state == 
deep_copy.state + + # check for a bug where SCB and CB were not copied + assert deep_copy.weight.SCB is not None + assert (linear8bit.weight.SCB == deep_copy.weight.SCB).all() + assert deep_copy.weight.CB is not None + assert (linear8bit.weight.CB == deep_copy.weight.CB).all() + + +def test_linear8bit_serialization(linear8bit): + serialized = pickle.dumps(linear8bit) + deserialized = pickle.loads(serialized) + assert linear8bit.weight.data.data_ptr() != deserialized.weight.data.data_ptr() + assert torch.allclose(linear8bit.weight.data, deserialized.weight.data) + assert linear8bit.bias.data.data_ptr() != deserialized.bias.data.data_ptr() + assert torch.allclose(linear8bit.bias.data, deserialized.bias.data) + assert linear8bit.state == deserialized.state + + # check for a bug where SCB and CB were not copied + assert (linear8bit.weight.SCB == deserialized.weight.SCB).all() + assert (linear8bit.weight.CB == deserialized.weight.CB).all() From b22ae26d4c9f0bec6b85987423737d447a78c387 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:36:37 +0200 Subject: [PATCH 25/76] fix for faulty #1222 ("Add `"lamb"` to `str2optimizer32bit`") (#1240) * Revert "Add `"lamb"` to `str2optimizer32bit`" * Update bitsandbytes/functional.py --- bitsandbytes/functional.py | 124 +++++++++++-------------------------- 1 file changed, 36 insertions(+), 88 deletions(-) diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index bbfbf0007..cea3179a1 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -27,128 +27,67 @@ def prod(iterable): if lib and lib.compiled_with_cuda: """C FUNCTIONS FOR OPTIMIZERS""" str2optimizer32bit = { - "adagrad": ( - lib.cadagrad32bit_grad_fp32, - lib.cadagrad32bit_grad_fp16, - ), "adam": ( lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16, lib.cadam32bit_grad_bf16, ), - "pagedadam": ( - lib.cpagedadam32bit_grad_fp32, - lib.cpagedadam32bit_grad_fp16, - lib.cpagedadam32bit_grad_bf16, - ), - "adamw": ( - lib.cadam32bit_grad_fp32, - lib.cadam32bit_grad_fp16, - lib.cadam32bit_grad_bf16, - ), - "pagedadamw": ( - lib.cpagedadam32bit_grad_fp32, - lib.cpagedadam32bit_grad_fp16, - lib.cpagedadam32bit_grad_bf16, - ), - "lamb": ( - lib.cadam32bit_grad_fp32, - lib.cadam32bit_grad_fp16, + "momentum": ( + lib.cmomentum32bit_grad_32, + lib.cmomentum32bit_grad_16, ), - "lars": ( - lib.clars32bit_grad_fp32, - lib.clars32bit_grad_fp16, + "rmsprop": ( + lib.crmsprop32bit_grad_32, + lib.crmsprop32bit_grad_16, ), "lion": ( lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16, lib.clion32bit_grad_bf16, ), - "momentum": ( - lib.cmomentum32bit_grad_fp32, - lib.cmomentum32bit_grad_fp16, + "adagrad": ( + lib.cadagrad32bit_grad_32, + lib.cadagrad32bit_grad_16, ), - "rmsprop": ( - lib.crmsprop32bit_grad_fp32, - lib.crmsprop32bit_grad_fp16, + "lamb": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, ), } str2optimizer8bit = { - "adagrad": ( - lib.cadagrad8bit_grad_fp32, - lib.cadagrad8bit_grad_fp16, - ), "adam": ( - lib.cadam_static_8bit_grad_fp32, - lib.cadam_static_8bit_grad_fp16, + lib.cadam_static_8bit_grad_32, + lib.cadam_static_8bit_grad_16, ), - "pagedadam": ( - lib.cpagedadam8bit_grad_fp32, - lib.cpagedadam8bit_grad_fp16, - lib.cpagedadam8bit_grad_bf16, + "momentum": ( + lib.cmomentum_static_8bit_grad_32, + lib.cmomentum_static_8bit_grad_16, ), - "adamw": ( - lib.cadam_static_8bit_grad_fp32, - lib.cadam_static_8bit_grad_fp16, + "rmsprop": ( + lib.crmsprop_static_8bit_grad_32, + 
lib.crmsprop_static_8bit_grad_16, ), - "pagedadamw": ( - lib.cpagedadam8bit_grad_fp32, - lib.cpagedadam8bit_grad_fp16, - lib.cpagedadam8bit_grad_bf16, + "lion": ( + lib.clion_static_8bit_grad_32, + lib.clion_static_8bit_grad_16, ), "lamb": ( - lib.cadam_static_8bit_grad_fp32, - lib.cadam_static_8bit_grad_fp16, + lib.cadam_static_8bit_grad_32, + lib.cadam_static_8bit_grad_16, ), "lars": ( - lib.clars8bit_grad_fp32, - lib.clars8bit_grad_fp16, - ), - "lion": ( - lib.clion_static_8bit_grad_fp32, - lib.clion_static_8bit_grad_fp16, - ), - "momentum": ( - lib.cmomentum_static_8bit_grad_fp32, - lib.cmomentum_static_8bit_grad_fp16, - ), - "rmsprop": ( - lib.crmsprop_static_8bit_grad_fp32, - lib.crmsprop_static_8bit_grad_fp16, + lib.cmomentum_static_8bit_grad_32, + lib.cmomentum_static_8bit_grad_16, ), } str2optimizer8bit_blockwise = { - "adagrad": ( - lib.cadagrad_8bit_blockwise_grad_fp32, - lib.cadagrad_8bit_blockwise_grad_fp16, - ), "adam": ( lib.cadam_8bit_blockwise_grad_fp32, lib.cadam_8bit_blockwise_grad_fp16, lib.cadam_8bit_blockwise_grad_bf16, ), - "pagedadam": ( - lib.cpagedadam8bit_blockwise_fp32, - lib.cpagedadam8bit_blockwise_fp16, - lib.cpagedadam8bit_blockwise_bf16, - ), - "adamw": ( - lib.cadam_8bit_blockwise_grad_fp32, - lib.cadam_8bit_blockwise_grad_fp16, - lib.cadam_8bit_blockwise_grad_bf16, - ), - "pagedadamw": ( - lib.cpagedadam8bit_blockwise_fp32, - lib.cpagedadam8bit_blockwise_fp16, - lib.cpagedadam8bit_blockwise_bf16, - ), - "lion": ( - lib.clion_8bit_blockwise_grad_fp32, - lib.clion_8bit_blockwise_grad_fp16, - lib.clion_8bit_blockwise_grad_bf16, - ), "momentum": ( lib.cmomentum_8bit_blockwise_grad_fp32, lib.cmomentum_8bit_blockwise_grad_fp16, @@ -157,6 +96,15 @@ def prod(iterable): lib.crmsprop_8bit_blockwise_grad_fp32, lib.crmsprop_8bit_blockwise_grad_fp16, ), + "lion": ( + lib.clion_8bit_blockwise_grad_fp32, + lib.clion_8bit_blockwise_grad_fp16, + lib.clion_8bit_blockwise_grad_bf16, + ), + "adagrad": ( + lib.cadagrad_8bit_blockwise_grad_fp32, + lib.cadagrad_8bit_blockwise_grad_fp16, + ), } From 517eaf2b5b789033dab1cd85459057129e6a0b19 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Thu, 6 Jun 2024 22:09:13 +0800 Subject: [PATCH 26/76] CPU: add torch.compile for F.double_quant and F.quantize_4bit (#1238) --- bitsandbytes/backends/cpu_xpu_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 138ec72f5..396234853 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -55,7 +55,7 @@ def _maybe_torch_compile(func): return func -# Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382 +@_maybe_torch_compile def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): """ Find absolute max values of each row/column of a tensor, and symmetrically quantize it to int8. 
@@ -268,7 +268,7 @@ def mm_dequant_impl( } -# It's faster not to use torch.compile +@_maybe_torch_compile def quantize_4bit_impl( A: Tensor, absmax: Tensor = None, From 5891465f3908c0e938e5501586a51dbdaecbfa6a Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 19 Jun 2024 22:22:50 +0000 Subject: [PATCH 27/76] Add build job for rocm --- .github/workflows/python-package.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 72e1b099a..78bc747c3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -103,6 +103,28 @@ jobs: name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} path: output/* retention-days: 7 + build-shared-libs-rocm: + strategy: + matrix: + os: [ubuntu-latest] + arch: [x86_64] + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + - uses: actions/checkout@v4 + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + - name: Clean up disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Build C++ + run: bash .github/scripts/build-rocm.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} build-wheels: needs: - build-shared-libs From d03a680871b2d665ba9f420b7513cab35b0b6960 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Wed, 19 Jun 2024 22:23:08 +0000 Subject: [PATCH 28/76] Add rocm build script --- .github/scripts/build-rocm.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/scripts/build-rocm.sh diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh new file mode 100644 index 000000000..fc7515aa7 --- /dev/null +++ b/.github/scripts/build-rocm.sh @@ -0,0 +1,19 @@ +#!/bin/bash +declare build_arch +declare build_os + +set -xeuo pipefail +if [ "${build_os:0:6}" == ubuntu ]; then + image=rocm/dev-ubuntu-22.04:6.1-complete + echo "Using image $image" + docker run --rm --platform "linux/$build_arch" -i \ + -w /src -v "$PWD:/src" "$image" sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=hip . \ + && cmake --build ." +fi + +#output_dir="output/${build_os}/${build_arch}" +#mkdir -p "${output_dir}" +#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") From ec9000f5444726589935ba8107249eddade9689d Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 20 Jun 2024 21:02:16 +0000 Subject: [PATCH 29/76] Copy shared obj file into output_dir --- .github/scripts/build-rocm.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index fc7515aa7..616e8c250 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -14,6 +14,6 @@ if [ "${build_os:0:6}" == ubuntu ]; then && cmake --build ." 
fi -#output_dir="output/${build_os}/${build_arch}" -#mkdir -p "${output_dir}" -#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") From 9b8c1da639c76c0fd41df00b835ab02b3508a64b Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Thu, 20 Jun 2024 21:02:50 +0000 Subject: [PATCH 30/76] upload build artifacts and enable wheels build --- .github/workflows/python-package.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 78bc747c3..3b243993b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -125,10 +125,17 @@ jobs: env: build_os: ${{ matrix.os }} build_arch: ${{ matrix.arch }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }} + path: output/* + retention-days: 7 build-wheels: needs: - build-shared-libs - build-shared-libs-cuda + - build-shared-libs-rocm strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] From 1413c5f3a2aed51140b86daa8ee9283c67cce738 Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:10:54 -0500 Subject: [PATCH 31/76] Remove cuda build temporarily --- .github/workflows/python-package.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 3b243993b..0b0b35416 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -134,7 +134,6 @@ jobs: build-wheels: needs: - build-shared-libs - - build-shared-libs-cuda - build-shared-libs-rocm strategy: matrix: From 195ae616d63906673c5025d67a785a2455787896 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:36:12 +0200 Subject: [PATCH 32/76] Bump the minor-patch group across 1 directory with 2 updates (#1253) Bumps the minor-patch group with 2 updates in the / directory: [pytest](https://github.com/pytest-dev/pytest) and [lion-pytorch](https://github.com/lucidrains/lion-pytorch). Updates `pytest` from 8.2.1 to 8.2.2 - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.1...8.2.2) Updates `lion-pytorch` from 0.1.4 to 0.2.2 - [Release notes](https://github.com/lucidrains/lion-pytorch/releases) - [Commits](https://github.com/lucidrains/lion-pytorch/compare/0.1.4...0.2.2) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-patch dependency-group: minor-patch - dependency-name: lion-pytorch dependency-type: direct:production update-type: version-update:semver-minor dependency-group: minor-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-ci.txt | 4 ++-- requirements-dev.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 0e9dd2407..285b5e7d8 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,6 +1,6 @@ # Requirements used for GitHub actions -pytest==8.2.1 +pytest==8.2.2 einops==0.8.0 -lion-pytorch==0.1.4 +lion-pytorch==0.2.2 scipy==1.10.1; python_version < "3.9" scipy==1.13.1; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index de7adce94..80927a4cb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,9 @@ # Requirements used for local development setuptools>=63 -pytest~=8.2.1 +pytest~=8.2.2 einops~=0.8.0 wheel~=0.43.0 -lion-pytorch~=0.1.4 +lion-pytorch~=0.2.2 scipy~=1.13.1 pandas~=2.2.2 matplotlib~=3.9.0 From 193120d1677ff0c4c502fc81835251e4b29d0c48 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 21 Jun 2024 18:48:44 +0200 Subject: [PATCH 33/76] cleanup docs-build breaking install instructs (#1244) * cleanup docs-build breaking install instructs * Update install instructions for ROCm * Update installation.mdx --------- Co-authored-by: Prasanth Nunna Co-authored-by: pnunna93 <104791500+pnunna93@users.noreply.github.com> --- docs/source/installation.mdx | 43 +++++++++++++++++++++++++++++ docs/source/rocm_installation.mdx | 46 ------------------------------- 2 files changed, 43 insertions(+), 46 deletions(-) delete mode 100644 docs/source/rocm_installation.mdx diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index caf22488f..c07ef29f6 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -91,6 +91,49 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com +## Multi-backend preview release (+ compilation) + +Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA: + + + + +For a ROCm specific install: + +bitsandbytes is fully supported from ROCm 6.1. + +**Note:** If you already installed ROCm and PyTorch, skip docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide. + +```bash +# Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled +docker pull rocm/pytorch:latest +docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest + +# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch +git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ + +# Install dependencies +pip install -r requirements-dev.txt + +# Compile & install +cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch +make +pip install . +``` + + + + +WIP + + + + +WIP + + + + ## PyTorch CUDA versions Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary. 
diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx deleted file mode 100644 index 5d4381e7d..000000000 --- a/docs/source/rocm_installation.mdx +++ /dev/null @@ -1,46 +0,0 @@ -# ROCm Installation - -Please follow these steps to install bitsandbytes on ROCm. - - - - -For latest installation: - -```bash -git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ -pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=hip -S . #Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch -make -pip install . -``` - - - - -For ROCm specific versions: - -Install Dependencies: - -```bash -# hipblaslt installation needed only for rocm<6.0 -apt install hipblaslt -pip install --upgrade pip -pip install einops lion_pytorch accelerate -pip install git+https://github.com/ROCm/transformers.git -``` - -Install bitsandbytes from [ROCm](https://github.com/ROCm/bitsandbytes) repo: - -```bash -git clone --recurse https://github.com/ROCm/bitsandbytes -cd bitsandbytes -# Checkout branch as needed -# for rocm 5.7 - rocm5.7_internal_testing -# for rocm 6.x - rocm6.2_internal_testing -git checkout -make hip -python setup.py install -``` - - From dada530149212d64d4b69534716202659ef37ec8 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Sat, 22 Jun 2024 00:53:26 +0800 Subject: [PATCH 34/76] cpu install guide (#1227) * cpu install guide * update readme * fix format * fix format * fix typo * add windows guide * fix readme to pip install . instead of building wheel * Update docs/source/installation.mdx Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/installation.mdx Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> * Update docs/source/installation.mdx Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/installation.mdx | 54 ++++++++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index caf22488f..c84d0c2ef 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,5 +1,7 @@ # Installation +## CUDA + bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. The latest version of bitsandbytes (v0.43.0) builds on: @@ -29,7 +31,7 @@ To install from PyPI. pip install bitsandbytes ``` -## Compile from source +### Compile from source For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations. @@ -91,7 +93,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com -## PyTorch CUDA versions +### PyTorch CUDA versions Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary. @@ -131,3 +133,51 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7 ``` 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded. + + +## Intel CPU + +> [!TIP] +> Intel CPU backend only supports building from source; for now, please follow the instructions below. 
+ +Like CUDA, you can compile bitsandbytes from source for Linux and Windows systems. Installing from source allows for more build options with different CMake configurations. + + + + +To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu: + +```bash +apt-get install -y build-essential cmake +``` + +We recommend installing **GCC >= 11** and have at least **GCC >= 6**. + +Now to install the bitsandbytes package from source, run the following commands: + +```bash +git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +pip install -r requirements-dev.txt +pip install intel_extension_for_pytorch +cmake -DCOMPUTE_BACKEND=cpu -S . +make +pip install . +``` + + + + +Windows systems require Visual Studio with C++ support. + +To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. + +```bash +git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cpu -S . +cmake --build . --config Release +pip install . +``` + + + From c79b1e926b05e856775e4962253e2dbf67bed103 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 21 Jun 2024 19:02:47 +0200 Subject: [PATCH 35/76] provide temp flag for outside libs to detect multi-backend preview (#1243) * provide temp flag for outside libs to detect multi-backend preview * fix typo in comment Co-authored-by: Benjamin Bossan --------- Co-authored-by: Benjamin Bossan --- bitsandbytes/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index eff7fc686..c3a2f2402 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -20,6 +20,13 @@ from .cextension import lib from .nn import modules +# NOTE: this is a temporary flag to allow outside libraries to employ conditional logic while the refactor is still in +# alpha/beta: sth like `if getattr(bitsandbytes, "is_multi_backend_refactor_preview", False): do sth` +# the getattr() call above would default to False and any string evaluates to True. This way we have temporary thing +# that we can remove in Transformers with the next release after the official BNB multi-platform release; then +# eventually making it the new default (e.g. just remove if statement and dedent in Transformers) +is_multi_backend_refactor_preview = "TO BE REMOVED ONCE MERGED TO `main`" # bool evals to True for str + # Always register the CPU backend. 
register_backend("cpu", CPUBackend()) From 1bfecc81e9f3b9a67a3b9bb9e1ab57468b1b9497 Mon Sep 17 00:00:00 2001 From: Xia Weiwen Date: Wed, 10 Jul 2024 15:26:35 +0800 Subject: [PATCH 36/76] CPU/XPU: disable torch.compile if g++ is not available (#1251) * CPU/XPU: disable torch.compile if g++ is not available * Fix lint issue --- bitsandbytes/backends/cpu_xpu_common.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index 396234853..c936dce14 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -1,3 +1,4 @@ +import subprocess from typing import Optional import warnings @@ -19,6 +20,14 @@ ipex_xpu = None +gxx_available = False +try: + subprocess.run(["g++", "--version"]) + gxx_available = True +except BaseException: + warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.") + + Tensor = torch.Tensor @@ -45,8 +54,8 @@ def _ipex_xpu_version_prereq(major, minor): def _maybe_torch_compile(func): - # torch.compile requires pytorch >= 2.0 - if _torch_version_prereq(2, 0): + # torch.compile requires g++ and pytorch >= 2.0 + if gxx_available and _torch_version_prereq(2, 0): options = {} # fx_graph_cache requires pytorch >= 2.2 if _torch_version_prereq(2, 2): From 08597844023a5c59e9b5d5dbeafbac4174fae5cc Mon Sep 17 00:00:00 2001 From: pnunna93 <104791500+pnunna93@users.noreply.github.com> Date: Fri, 12 Jul 2024 09:15:10 -0500 Subject: [PATCH 37/76] Create build job for ROCm (#1255) * Add build job for rocm * Add rocm build script --- .github/scripts/build-rocm.sh | 19 +++++++++++++++++++ .github/workflows/python-package.yml | 22 ++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 .github/scripts/build-rocm.sh diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh new file mode 100644 index 000000000..fc7515aa7 --- /dev/null +++ b/.github/scripts/build-rocm.sh @@ -0,0 +1,19 @@ +#!/bin/bash +declare build_arch +declare build_os + +set -xeuo pipefail +if [ "${build_os:0:6}" == ubuntu ]; then + image=rocm/dev-ubuntu-22.04:6.1-complete + echo "Using image $image" + docker run --rm --platform "linux/$build_arch" -i \ + -w /src -v "$PWD:/src" "$image" sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=hip . \ + && cmake --build ." +fi + +#output_dir="output/${build_os}/${build_arch}" +#mkdir -p "${output_dir}" +#(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 72e1b099a..78bc747c3 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -103,6 +103,28 @@ jobs: name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} path: output/* retention-days: 7 + build-shared-libs-rocm: + strategy: + matrix: + os: [ubuntu-latest] + arch: [x86_64] + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Docker multiarch
+        if: startsWith(matrix.os, 'ubuntu')
+        uses: docker/setup-qemu-action@v2
+      - name: Clean up disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /opt/ghc
+          sudo rm -rf "/usr/local/share/boost"
+          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+      - name: Build C++
+        run: bash .github/scripts/build-rocm.sh
+        env:
+          build_os: ${{ matrix.os }}
+          build_arch: ${{ matrix.arch }}
   build-wheels:
     needs:
       - build-shared-libs
From 1935a459f7c72d1ec8349e4ad1e84cbd6ff94e68 Mon Sep 17 00:00:00 2001
From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:40:17 +0200
Subject: [PATCH 38/76] fix broken links in autodoc API reference (#1275)

* Update build_documentation.yml

* Update build_pr_documentation.yml

* Update build_pr_documentation.yml
---
 .github/workflows/build_documentation.yml    | 2 ++
 .github/workflows/build_pr_documentation.yml | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index a19e7511d..e027f7556 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -14,6 +14,8 @@ jobs:
       commit_sha: ${{ github.sha }}
       package: bitsandbytes
       repo_owner: TimDettmers
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: '' # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder
     secrets:
       hf_token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index cc833df5d..b83794a5f 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -9,11 +9,13 @@ concurrency:

 jobs:
   build:
-    if: github.repository == 'TimDettmers/bitsandbytes'
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
       repo_owner: TimDettmers
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: '' # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder
From 85e01276874b7563bd23caf56ac2c3bdbf7c90fc Mon Sep 17 00:00:00 2001
From: Markus Hennerbichler
Date: Fri, 12 Jul 2024 15:40:37 +0100
Subject: [PATCH 39/76] Fix CUDA 12.5 build issue (#1273)

pythonInterface.cpp depends on ops.cuh which in turn depends on some thrust headers.
It is defined as a C++ compilation unit, which is problematic because thrust doesn't
guarantee compatibility with a host compiler. This is starting to cause issues with
CUDA 12.5. There is no dependency on the thrust headers, which means they can be
removed without other consequences.
--- csrc/kernels.cu | 2 -- csrc/ops.cuh | 4 ---- 2 files changed, 6 deletions(-) diff --git a/csrc/kernels.cu b/csrc/kernels.cu index f4673359b..e4d459961 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -12,8 +12,6 @@ #include #include #include -#include -#include #include diff --git a/csrc/ops.cuh b/csrc/ops.cuh index da9df6af0..8b9a4f449 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -19,10 +19,6 @@ #include #include -#include -#include - - #define CUDA_CHECK_RETURN(value) { \ cudaError_t _m_cudaStat = value; \ From 6866a4ad464239a3a06c9d8911237c0da294e4d7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 12 Jul 2024 16:42:28 +0200 Subject: [PATCH 40/76] Bump scipy from 1.13.1 to 1.14.0 in the minor-patch group (#1266) Bumps the minor-patch group with 1 update: [scipy](https://github.com/scipy/scipy). Updates `scipy` from 1.13.1 to 1.14.0 - [Release notes](https://github.com/scipy/scipy/releases) - [Commits](https://github.com/scipy/scipy/compare/v1.13.1...v1.14.0) --- updated-dependencies: - dependency-name: scipy dependency-type: direct:production update-type: version-update:semver-minor dependency-group: minor-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-ci.txt | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index 285b5e7d8..b36fd6586 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -3,4 +3,4 @@ pytest==8.2.2 einops==0.8.0 lion-pytorch==0.2.2 scipy==1.10.1; python_version < "3.9" -scipy==1.13.1; python_version >= "3.9" +scipy==1.14.0; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index 80927a4cb..dc75f9685 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,6 +4,6 @@ pytest~=8.2.2 einops~=0.8.0 wheel~=0.43.0 lion-pytorch~=0.2.2 -scipy~=1.13.1 +scipy~=1.14.0 pandas~=2.2.2 matplotlib~=3.9.0 From 8c6ab698d76baad7265d1a91965ade0982596704 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 12 Jul 2024 16:48:26 +0200 Subject: [PATCH 41/76] update repo owner --- .github/workflows/build_pr_documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index b83794a5f..4679761c6 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -15,7 +15,7 @@ jobs: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: bitsandbytes - repo_owner: TimDettmers + repo_owner: bitsandbytes-foundation # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/ version_tag_suffix: '' # defaults to '/src' custom_container: huggingface/transformers-doc-builder From 7be11439954d38a8b784ea86286ac0045769db53 Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Fri, 12 Jul 2024 16:48:51 +0200 Subject: [PATCH 42/76] update repo owner --- .github/workflows/build_documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index e027f7556..ce4a55aaa 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -13,7 +13,7 
@@ jobs: with: commit_sha: ${{ github.sha }} package: bitsandbytes - repo_owner: TimDettmers + repo_owner: bitsandbytes-foundation # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/ version_tag_suffix: '' # defaults to '/src' custom_container: huggingface/transformers-doc-builder From 6948f0b8fe3295b6c6fe1263bc0d1ce874468cd2 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 15 Jul 2024 04:58:02 -0400 Subject: [PATCH 43/76] Fix Windows CUDA build compatibility with newest MSVC (#1276) * Add support for building with latest MSVC * Update MSVC 1940+ support for CUDA builds. --- CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index be0d3555f..6f3914456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,13 @@ endif() if(BUILD_CUDA) + # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+. + # Workaround: use --allow-unsupported-compiler + # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes. + if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940) + string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler") + endif() + enable_language(CUDA) # This will fail if CUDA is not found find_package(CUDAToolkit REQUIRED) @@ -188,7 +195,6 @@ if(WIN32) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() -# Weird MSVC hacks if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") endif() From f2b2310eb4b7034c14e87dca2a61604ea2a0163f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:42:04 +0200 Subject: [PATCH 44/76] Update matplotlib requirement from ~=3.9.0 to ~=3.9.1 in the major group (#1278) Updates the requirements on [matplotlib](https://github.com/matplotlib/matplotlib) to permit the latest version. Updates `matplotlib` to 3.9.1 - [Release notes](https://github.com/matplotlib/matplotlib/releases) - [Commits](https://github.com/matplotlib/matplotlib/compare/v3.9.0...v3.9.1) --- updated-dependencies: - dependency-name: matplotlib dependency-type: direct:development dependency-group: major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index dc75f9685..94098e012 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,4 +6,4 @@ wheel~=0.43.0 lion-pytorch~=0.2.2 scipy~=1.14.0 pandas~=2.2.2 -matplotlib~=3.9.0 +matplotlib~=3.9.1 From 39b42e749f623193d69b917a01dcb4ca5b4bbdc0 Mon Sep 17 00:00:00 2001 From: Vladimir Malinovskii Date: Mon, 15 Jul 2024 17:51:21 +0300 Subject: [PATCH 45/76] Fixed tests for cpu only platforms (#1259) * fixed test_4bit_warnings on cpu-only platforms * fixed linear8bit-based tests for cpu only platforms --- tests/test_linear8bitlt.py | 2 +- tests/test_modules.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index e55abe110..9b7923312 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -182,7 +182,7 @@ def test_linear_serialization( @pytest.fixture -def linear8bit(): +def linear8bit(requires_cuda): linear = torch.nn.Linear(32, 96) linear_custom = Linear8bitLt( linear.in_features, diff --git a/tests/test_modules.py b/tests/test_modules.py index db4d72410..9d507c6b4 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -616,7 +616,7 @@ def test_fp8linear(): assert bgraderr < 0.00002 -def test_4bit_warnings(): +def test_4bit_warnings(requires_cuda): dim1 = 64 with pytest.warns(UserWarning, match=r"inference or training"): From 9e75374104cec965d9a4a630a68ce4e2b77b066f Mon Sep 17 00:00:00 2001 From: Ther <1329438302@qq.com> Date: Tue, 16 Jul 2024 16:56:41 +0800 Subject: [PATCH 46/76] fix QLoRA mem bug: delete useless buffered activation (#1270) * chore: delete useless buffered activation * fix: fix bugs --- bitsandbytes/autograd/_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index e9821cd36..d33dd1bc5 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -513,7 +513,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype if any(ctx.needs_input_grad[:2]): - ctx.tensors = (A, B) + ctx.tensors = (None, B) else: ctx.tensors = (None, None) @@ -526,7 +526,7 @@ def backward(ctx, grad_output): return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad - A, B = ctx.tensors + _, B = ctx.tensors grad_A, grad_B, grad_bias = None, None, None From 0bdd57ccb20e9690b95c2fa02315d9507afa69dd Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Sun, 21 Jul 2024 08:31:06 -0400 Subject: [PATCH 47/76] Add CUDA 12.5 and update 12.4 builds (#1284) * Add CUDA 12.5 builds and enable CUDA 12.4 on Windows * Update install doc --- .github/workflows/python-package.yml | 6 ++---- docs/source/installation.mdx | 4 ++-- install_cuda.py | 3 ++- install_cuda.sh | 10 +++++++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 72e1b099a..698c21481 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -63,12 +63,10 @@ jobs: os: 
[ubuntu-latest, windows-latest] arch: [x86_64, aarch64] cuda_version: - ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"] + ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.0"] exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 - - os: windows-latest # The Jimver/cuda-toolkit is action used for Windows builds is not updated for 12.4 yet. - cuda_version: "12.4.0" - os: ubuntu-latest # Temporary. Takes too long, not ready yet. arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents @@ -79,7 +77,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') uses: docker/setup-qemu-action@v2 # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.14 + - uses: Jimver/cuda-toolkit@v0.2.16 if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c84d0c2ef..877c97456 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -2,7 +2,7 @@ ## CUDA -bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. +bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. The latest version of bitsandbytes (v0.43.0) builds on: @@ -107,7 +107,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte ```bash wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124} +# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125} # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc diff --git a/install_cuda.py b/install_cuda.py index cf7c8ee71..8267c5e2b 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -17,7 +17,8 @@ "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run", "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run", "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", - "124": "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run", + "124": "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run", + "125": "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run", } diff --git a/install_cuda.sh b/install_cuda.sh index 2e7fe8ed2..0aa9531fc 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -11,7 +11,8 @@ URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installer URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run -URL124=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run 
+URL124=https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run +URL125=https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run CUDA_VERSION=$1 BASE_PATH=$2 @@ -60,11 +61,14 @@ if [[ -n "$CUDA_VERSION" ]]; then elif [[ "$CUDA_VERSION" -eq "124" ]]; then URL=$URL124 FOLDER=cuda-12.4 + elif [[ "$CUDA_VERSION" -eq "125" ]]; then + URL=$URL125 + FOLDER=cuda-12.5 else - echo "argument error: No cuda version passed as input. Choose among versions 110 to 124" + echo "argument error: No cuda version passed as input. Choose among versions 110 to 125" fi else - echo "argument error: No cuda version passed as input. Choose among versions 92 to 123" + echo "argument error: No cuda version passed as input. Choose among versions 110 to 125" fi FILE=$(basename $URL) From 5212a0f2a585abba1cc2a65f82f7c4fc939c453f Mon Sep 17 00:00:00 2001 From: Titus <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:36:31 +0200 Subject: [PATCH 48/76] Edenzzzz's fix for min_8bit_size functionality in Optimizer base classes (#1286) * fix min_8bit_size invalid bug * Apply same fix to other optimizer base class --------- Co-authored-by: Edenzzzz --- bitsandbytes/optim/optimizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index f1e60e5e7..39fa0e7ff 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -437,7 +437,7 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): + if dtype == torch.float32: state["state1"] = self.get_state_buffer(p, dtype=torch.float32) state["state2"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: @@ -656,7 +656,7 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): + if dtype == torch.float32: state["state1"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: From a3f55cea3ab29218067809770bc8bf2380ec46cd Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Mon, 22 Jul 2024 23:14:46 +0800 Subject: [PATCH 49/76] Fixed optim update error with non-contiguous grads/params (#1187) * Fixed optim update error with non-contiguous grads * fix formatting Thanks @Edenzzzz for this contribution! 
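As a rough, self-contained sketch of the failure mode (plain PyTorch with made-up shapes, not the library's own kernels): sharded or offloaded setups such as DeepSpeed (see #1187) can hand the optimizer gradients or parameter data that are strided views rather than dense buffers, and the added `.contiguous()` calls normalize the layout before the fused update runs.

```python
import torch

p = torch.nn.Parameter(torch.randn(4, 8))

# Gradients can arrive as non-contiguous views rather than dense buffers:
p.grad = torch.randn(8, 4).t()  # transposed view: same shape as p, non-standard strides
assert p.grad.shape == p.shape and not p.grad.is_contiguous()

# A kernel that walks the underlying storage linearly would read elements out of order;
# forcing a standard layout first (what this patch does at the top of both `update_step`
# implementations) avoids that:
p.data = p.data.contiguous()
p.grad = p.grad.contiguous()
assert p.data.is_contiguous() and p.grad.is_contiguous()
```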
--------- Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> --- bitsandbytes/optim/optimizer.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index 39fa0e7ff..e9c857d49 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -474,6 +474,10 @@ def init_state(self, group, p, gindex, pindex): @torch.no_grad() def update_step(self, group, p, gindex, pindex): + # avoid update error from non-contiguous memory layout + p.data = p.data.contiguous() + p.grad = p.grad.contiguous() + state = self.state[p] grad = p.grad @@ -685,6 +689,10 @@ def init_state(self, group, p, gindex, pindex): @torch.no_grad() def update_step(self, group, p, gindex, pindex): + # avoid update error from non-contiguous memory layout + p.data = p.data.contiguous() + p.grad = p.grad.contiguous() + state = self.state[p] grad = p.grad From e3ae243be2f8bfb36715610e837363a515840b39 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 17:15:51 +0200 Subject: [PATCH 50/76] Bump pytest from 8.2.2 to 8.3.1 in the minor-patch group (#1287) Bumps the minor-patch group with 1 update: [pytest](https://github.com/pytest-dev/pytest). Updates `pytest` from 8.2.2 to 8.3.1 - [Release notes](https://github.com/pytest-dev/pytest/releases) - [Changelog](https://github.com/pytest-dev/pytest/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest/compare/8.2.2...8.3.1) --- updated-dependencies: - dependency-name: pytest dependency-type: direct:production update-type: version-update:semver-minor dependency-group: minor-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-ci.txt | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-ci.txt b/requirements-ci.txt index b36fd6586..182e1023e 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,5 +1,5 @@ # Requirements used for GitHub actions -pytest==8.2.2 +pytest==8.3.1 einops==0.8.0 lion-pytorch==0.2.2 scipy==1.10.1; python_version < "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index 94098e012..41211880c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,6 @@ # Requirements used for local development setuptools>=63 -pytest~=8.2.2 +pytest~=8.3.1 einops~=0.8.0 wheel~=0.43.0 lion-pytorch~=0.2.2 From 7fed393aa8380f2d7f7c760bbd6a2f68b5caa9ea Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 23 Jul 2024 11:32:50 -0400 Subject: [PATCH 51/76] Fix restoration of quant_storage for CPU offloading (#1279) * Fix restoration of quant_storage for CPU offloading * Clarify comment on default quant_storage in Params4bit.from_prequantized() * fix to make quant_storage dynamic based on serialized dtype * delete obsolete comment --------- Co-authored-by: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> --- bitsandbytes/nn/modules.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 05f7c04db..40766ad41 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -282,10 +282,13 @@ def from_prequantized( self.compress_statistics = self.quant_state.nested self.quant_type = self.quant_state.quant_type self.bnb_quantized = True 
+ + self.quant_storage = data.dtype + return self def _quantize(self, device): - w = self.data.contiguous().cuda(device) + w = self.data.contiguous().to(device) w_4bit, quant_state = bnb.functional.quantize_4bit( w, blocksize=self.blocksize, @@ -333,6 +336,7 @@ def to(self, *args, **kwargs): blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type, + quant_storage=self.quant_storage, ) return new_param @@ -450,7 +454,7 @@ def forward(self, x: torch.Tensor): # since we registered the module, we can recover the state here assert self.weight.shape[1] == 1 if not isinstance(self.weight, Params4bit): - self.weight = Params4bit(self.weight, quant_storage=self.quant_storage) + self.weight = Params4bit(self.weight, quant_storage=self.quant_storage, bnb_quantized=True) self.weight.quant_state = self.quant_state else: print( From 1571110648dc5b0e603316c9ce2b0f16ac85cdbb Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 23 Jul 2024 16:40:31 +0000 Subject: [PATCH 52/76] remove unnecessary version mention --- docs/source/installation.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 877c97456..8187fbf81 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -4,7 +4,7 @@ bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. -The latest version of bitsandbytes (v0.43.0) builds on: +The latest version of bitsandbytes builds on: | OS | CUDA | Compiler | |---|---|---| From ce53caf3c358ec3f81db6a9edc0b6fc2f17d9503 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 23 Jul 2024 17:45:26 +0000 Subject: [PATCH 53/76] release 0.43.2 --- CHANGELOG.md | 18 ++++++++++++++++++ _typos.toml | 5 +++++ bitsandbytes/__init__.py | 2 +- setup.py | 2 +- 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c456fa9e5..8ad648df1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,21 @@ +### 0.43.2 + +#### Improvements: + +- docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu) +- Add CUDA 12.5 and update 12.4 builds (#1284) + +#### Bug Fixes + +- 4bit getstate and 8bit deepcopy (#1230 #1231, thanks @BenjaminBossan) +- missing optimizers in `str2optimizer32bit` (#1222, thanks @EtienneDosSantos) +- CUDA 12.5 build issue (#1273, thanks @HennerM) +- fix for min_8bit_size functionality in Optimizer base classes (#1286, thanks @Edenzzzz) +- QLoRA mem bug (#1270, thanks @Ther-nullptr) +- tests for cpu only platforms (#1259, thanks @galqiwi) +- restoration of quant_storage for CPU offloading (#1279) +- optim update error with non-contiguous grads/params (deepspeed) (#1187) + ### 0.43.1 #### Improvements: diff --git a/_typos.toml b/_typos.toml index a04206b8d..e4e7287fb 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,5 +1,10 @@ [files] +[default] +extend-ignore-re = [ + "@Ther-nul", # valid Github user +] + [default.extend-identifiers] [type.py.extend-words] diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 51cbde208..ad5f01539 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -21,4 +21,4 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.43.2.dev" +__version__ = "0.43.2" diff --git a/setup.py b/setup.py index f8d6a92a1..d2b78f6b5 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( 
name="bitsandbytes", - version="0.43.2.dev", + version="0.43.2", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", From a7c08afd70af46646ccdebcd6bb459b66f0e9e54 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 23 Jul 2024 18:26:24 +0000 Subject: [PATCH 54/76] bump version tag to next dev --- bitsandbytes/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index ad5f01539..a8acfbfc5 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -21,4 +21,4 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.43.2" +__version__ = "0.43.3.dev" diff --git a/setup.py b/setup.py index d2b78f6b5..18de0fe5b 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( name="bitsandbytes", - version="0.43.2", + version="0.43.3.dev", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", From 9b726798542e01c45a7a4a841e144311980b90d6 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:13:24 +0000 Subject: [PATCH 55/76] Changelog: add explanation r. QLoRA mem savings --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ad648df1..e446155b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ ### 0.43.2 +This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes. + +For each sequence (i.e. batch size increase of one) we expect memory savings of: +- 405B: 39GB for seqlen 1024, and 4888GB for 128k +- 70B: 20.1GB for 1024 and 2516GB for 128k + +This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug. + #### Improvements: - docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu) From 81375f8e67e9433c778fce3011930159357271c8 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Sat, 27 Jul 2024 13:11:00 +0000 Subject: [PATCH 56/76] docs: add more details to Intel install --- docs/source/installation.mdx | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 5b2cfe1d3..2f8fe4db7 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -31,7 +31,7 @@ To install from PyPI. pip install bitsandbytes ``` -### Compile from source +### Compile from source[[compile]] For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations. @@ -174,7 +174,18 @@ pip install -e . # `-e` for "editable" install, when developing BNB (otherwise > [!TIP] > Intel CPU backend only supports building from source; for now, please follow the instructions below. -Like CUDA, you can compile bitsandbytes from source for Linux and Windows systems. Installing from source allows for more build options with different CMake configurations. +Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems. + +The below commands are for Linux. 
For installing on Windows, please adapt the below commands according to the same pattern as described in [the section above on compiling from source under the Windows tab](#compile). + +``` +git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +pip install intel_extension_for_pytorch +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cpu -S . +make +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) +``` From 7800734637d9982b3aab9f4606f6f06456073a83 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:13:24 +0000 Subject: [PATCH 57/76] Changelog: add explanation r. QLoRA mem savings --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ad648df1..ed324f09e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ ### 0.43.2 +This release is quite significant as the QLoRA bug fix has big implications for higher `seqlen` and batch sizes. + +For each sequence (i.e. batch size increase of one) we expect memory savings of: +- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,000` +- 70B: 10.1GB for `seqlen=1024` and 1258GB for `seqlen=128,000` + +This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug. + #### Improvements: - docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu) From 24f7b652cec822849fba69c583b8e73d84446627 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Sat, 27 Jul 2024 14:08:30 +0000 Subject: [PATCH 58/76] docs: cleanup of compilation instructions --- docs/source/installation.mdx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 2f8fe4db7..f917f2623 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -2,7 +2,7 @@ ## CUDA -bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. There's a multi-backend effort under way which is currently in alpha release, see further down in this document. +bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend). The latest version of bitsandbytes builds on: @@ -134,7 +134,7 @@ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded. -## Multi-backend preview release (+ compilation) +## Multi-backend preview release compilation[[multi-backend]] Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA: ### AMD GPU -For a ROCm specific install: +bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release). -bitsandbytes is fully supported from ROCm 6.1.
- -**Note:** If you already installed ROCm and PyTorch, skip docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide. +> [!TIP] +> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide. ```bash # Create a docker container with latest pytorch. It comes with ROCm and pytorch preinstalled @@ -161,6 +160,7 @@ git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bit pip install -r requirements-dev.txt # Compile & install +apt-get install -y build-essential cmake # install build tools dependencies, unless present cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch make pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) @@ -179,7 +179,7 @@ Similar to the CUDA case, you can compile bitsandbytes from source for Linux and The below commands are for Linux. For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile). ``` -git clone --branch multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ pip install intel_extension_for_pytorch pip install -r requirements-dev.txt cmake -DCOMPUTE_BACKEND=cpu -S . From e3b27805346b7d55a5ca4ba91fb374415c11dc05 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Sat, 27 Jul 2024 14:16:49 +0000 Subject: [PATCH 59/76] docs: CHANGELOG.md fix --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e446155b0..ed324f09e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,8 @@ This release is quite significant as the QLoRA bug fix big implications for higher `seqlen` and batch sizes. For each sequence (i.e. batch size increase of one) we expect memory savings of: -- 405B: 39GB for seqlen 1024, and 4888GB for 128k -- 70B: 20.1GB for 1024 and 2516GB for 128k +- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,00` +- 70B: 10.1GB for `seqlen=1024` and 1258GB for `seqlen=128,00` This was due to activations being unnecessary for frozen parameters, yet the memory for them was still erroneously allocated due to the now fixed bug. 
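The savings quoted in the changelog entries above scale roughly linearly with sequence length. As a rough, hedged sketch of how one could probe this effect locally (this snippet is not part of any patch in this series; the layer size, batch shape, and dtype are made-up values chosen only for illustration), peak CUDA memory can be measured around a forward/backward pass through a single frozen 4-bit layer:

```python
# Illustrative peak-memory probe around a frozen 4-bit layer (sizes are arbitrary).
import torch
import bitsandbytes as bnb

layer = bnb.nn.Linear4bit(4096, 4096, bias=False, compute_dtype=torch.float16).to("cuda")
layer.weight.requires_grad = False  # frozen base weight, as in QLoRA fine-tuning

# A toy batch: 4 sequences of 1024 tokens with hidden size 4096 (assumed values).
x = torch.randn(4, 1024, 4096, device="cuda", dtype=torch.float16, requires_grad=True)

torch.cuda.reset_peak_memory_stats()
out = layer(x)
out.sum().backward()
print(f"peak allocated: {torch.cuda.max_memory_allocated() / 2**30:.2f} GiB")
```

Comparing the printed peak before and after the fix referenced in #1270 is one way to see the activation memory that is no longer retained for frozen parameters.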
From c8b4b33ef40d240b9650268dfe6ae15ac5472664 Mon Sep 17 00:00:00 2001 From: jiqing-feng <107918818+jiqing-feng@users.noreply.github.com> Date: Sat, 27 Jul 2024 23:28:30 +0800 Subject: [PATCH 60/76] fix dtype mismatch (#1285) --- bitsandbytes/backends/cpu_xpu_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index c936dce14..04755ed2d 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -524,7 +524,7 @@ def gemm_4bit_impl( output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle()) else: dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize) - output = torch.matmul(A, dqB) + output = torch.matmul(A, dqB.to(A.dtype)) if out is not None: out.copy_(output) else: From fd655b02663d1f692734b1a6376421dfbe1064b9 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Mon, 29 Jul 2024 21:00:16 +0000 Subject: [PATCH 61/76] Add ROCm version to .so filename --- CMakeLists.txt | 2 ++ bitsandbytes/cextension.py | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bedefd51..c526678c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -192,7 +192,9 @@ elseif(BUILD_HIP) # get hip version execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}") + string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}") + string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}") if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1") string(APPEND BNB_OUTPUT_NAME "_nohipblaslt") endif() diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 03d2cbd61..6e391a752 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: """ if torch.version.hip: if BNB_HIP_VERSION < 601: - return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" else: - return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}" library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" if not cuda_specs.has_cublaslt: # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt @@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary: if torch.version.hip: hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2]) HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor + BNB_HIP_VERSION_SHORT = str(hip_major) + str(hip_minor) else: HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0 + BNB_HIP_VERSION_SHORT = "" lib = get_native_library() except Exception as e: lib = None From 6b77f4c3f7aa042518d566489e13b774c96f68e3 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Mon, 29 Jul 2024 21:16:57 +0000 Subject: [PATCH 62/76] Add rocm_version to whls build --- .github/scripts/build-rocm.sh | 3 ++- .github/workflows/python-package.yml | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index 616e8c250..cc15210fd 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -1,10 +1,11 @@ #!/bin/bash declare build_arch declare build_os +declare rocm_version set 
-xeuo pipefail if [ "${build_os:0:6}" == ubuntu ]; then - image=rocm/dev-ubuntu-22.04:6.1-complete + image=rocm/dev-ubuntu-22.04:${rocm_version}-complete echo "Using image $image" docker run --rm --platform "linux/$build_arch" -i \ -w /src -v "$PWD:/src" "$image" sh -c \ diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 0b0b35416..cab735562 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -108,6 +108,8 @@ jobs: matrix: os: [ubuntu-latest] arch: [x86_64] + rocm_version: + ["6.1.2"] runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents steps: - uses: actions/checkout@v4 @@ -128,7 +130,7 @@ jobs: - name: Upload build artifact uses: actions/upload-artifact@v4 with: - name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }} + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} path: output/* retention-days: 7 build-wheels: From 78324b32075b7ae6076c304e1dfd5f71db01704b Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Mon, 29 Jul 2024 21:23:24 +0000 Subject: [PATCH 63/76] Revert "Remove cuda build temporarily" This reverts commit 1413c5f3a2aed51140b86daa8ee9283c67cce738. --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index cab735562..6a4a6205b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -136,6 +136,7 @@ jobs: build-wheels: needs: - build-shared-libs + - build-shared-libs-cuda - build-shared-libs-rocm strategy: matrix: From c146b8b8f2fe9d6fec5f1f1b8da25b1ec60d6ac6 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Mon, 29 Jul 2024 21:36:41 +0000 Subject: [PATCH 64/76] Add rocm_version env var --- .github/workflows/python-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 6a4a6205b..f4cc5486b 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -127,6 +127,7 @@ jobs: env: build_os: ${{ matrix.os }} build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} - name: Upload build artifact uses: actions/upload-artifact@v4 with: From d6c3df47a46d55c093e9bd8bf61ee3489bac605e Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 05:01:18 +0000 Subject: [PATCH 65/76] Remove thrush header files --- csrc/kernels.hip | 2 -- csrc/ops_hip.cuh | 6 ------ 2 files changed, 8 deletions(-) diff --git a/csrc/kernels.hip b/csrc/kernels.hip index ca77dceda..d8d7cdba5 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -10,8 +10,6 @@ #include #include -#include -#include //#include diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh index 1b9c13063..e57cbb3b5 100644 --- a/csrc/ops_hip.cuh +++ b/csrc/ops_hip.cuh @@ -21,12 +21,6 @@ #include #include -/* -#include -#include -*/ - - #define CUDA_CHECK_RETURN(value) { \ hipError_t _m_cudaStat = value; \ if (_m_cudaStat != hipSuccess) { \ From 7e9a65c33f66fffcb14ee2438170718777c06022 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 06:44:49 +0000 Subject: [PATCH 66/76] Print node info --- .github/scripts/build-rocm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index cc15210fd..8aac2c04b 100644 --- 
a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -11,7 +11,7 @@ if [ "${build_os:0:6}" == ubuntu ]; then -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=hip . \ + && rocminfo && cmake -DCOMPUTE_BACKEND=hip . \ && cmake --build ." fi From cdb209a2eb896d9c4166f53e9b2aa580c10e42c0 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 06:51:34 +0000 Subject: [PATCH 67/76] print cuda node info --- .github/scripts/build-cuda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh index 0f9b8d726..fc79a92f6 100644 --- a/.github/scripts/build-cuda.sh +++ b/.github/scripts/build-cuda.sh @@ -15,7 +15,7 @@ for NO_CUBLASLT in ON OFF; do docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && nvidia-smi && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else pip install cmake==3.28.3 From 77e149917dd4bb5be87099289edf53421fef6fe8 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 06:56:11 +0000 Subject: [PATCH 68/76] Revert "print cuda node info" This reverts commit cdb209a2eb896d9c4166f53e9b2aa580c10e42c0. --- .github/scripts/build-cuda.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/build-cuda.sh b/.github/scripts/build-cuda.sh index fc79a92f6..0f9b8d726 100644 --- a/.github/scripts/build-cuda.sh +++ b/.github/scripts/build-cuda.sh @@ -15,7 +15,7 @@ for NO_CUBLASLT in ON OFF; do docker run --platform "linux/$build_arch" -i -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && nvidia-smi && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ + && cmake -DCOMPUTE_BACKEND=cuda -DCOMPUTE_CAPABILITY=\"${build_capability}\" -DNO_CUBLASLT=${NO_CUBLASLT} . \ && cmake --build ." else pip install cmake==3.28.3 From 7c9190990478d3980eece86909b5faf4d36b3e16 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 06:56:54 +0000 Subject: [PATCH 69/76] Revert "Print node info" This reverts commit 7e9a65c33f66fffcb14ee2438170718777c06022. --- .github/scripts/build-rocm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index 8aac2c04b..cc15210fd 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -11,7 +11,7 @@ if [ "${build_os:0:6}" == ubuntu ]; then -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && rocminfo && cmake -DCOMPUTE_BACKEND=hip . \ + && cmake -DCOMPUTE_BACKEND=hip . \ && cmake --build ." 
fi From b78b3400b59e2f40dad5ba4f676e7ffd46dff978 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 07:14:18 +0000 Subject: [PATCH 70/76] Add rocm arch to compile command --- .github/scripts/build-rocm.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index cc15210fd..a5933da3f 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -4,6 +4,7 @@ declare build_os declare rocm_version set -xeuo pipefail +bnb_rocm_arch="gfx906;gfx908;gfx90a;gfx942;gfx1100;gfx1030" if [ "${build_os:0:6}" == ubuntu ]; then image=rocm/dev-ubuntu-22.04:${rocm_version}-complete echo "Using image $image" @@ -11,7 +12,7 @@ if [ "${build_os:0:6}" == ubuntu ]; then -w /src -v "$PWD:/src" "$image" sh -c \ "apt-get update \ && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ - && cmake -DCOMPUTE_BACKEND=hip . \ + && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \ && cmake --build ." fi From a62b9d454b25f5f7933e7793fd553cc81699a925 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 19:21:47 +0000 Subject: [PATCH 71/76] Rename .so files to rocm --- CMakeLists.txt | 2 +- bitsandbytes/cextension.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a7671239..0891f75b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -194,7 +194,7 @@ elseif(BUILD_HIP) list(APPEND SRC_FILES ${HIP_FILES}) - string(APPEND BNB_OUTPUT_NAME "_hip") + string(APPEND BNB_OUTPUT_NAME "_rocm") # get hip version execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION) diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 6e391a752..a096d0d51 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: """ if torch.version.hip: if BNB_HIP_VERSION < 601: - return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" else: - return PACKAGE_DIR / f"libbitsandbytes_hip{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}" library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" if not cuda_specs.has_cublaslt: # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt From 9059bff540eac1b871eca220120c37a32186d481 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 19:27:12 +0000 Subject: [PATCH 72/76] Update default gpu arch --- .github/scripts/build-rocm.sh | 2 +- CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh index a5933da3f..b508fac69 100644 --- a/.github/scripts/build-rocm.sh +++ b/.github/scripts/build-rocm.sh @@ -4,7 +4,7 @@ declare build_os declare rocm_version set -xeuo pipefail -bnb_rocm_arch="gfx906;gfx908;gfx90a;gfx942;gfx1100;gfx1030" +bnb_rocm_arch="gfx90a;gfx942;gfx1100" if [ "${build_os:0:6}" == ubuntu ]; then image=rocm/dev-ubuntu-22.04:${rocm_version}-complete echo "Using image $image" diff --git a/CMakeLists.txt b/CMakeLists.txt index 0891f75b0..eac72fe52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -185,7 +185,7 @@ elseif(BUILD_HIP) set(CMAKE_HIP_ARCHITECTURES 
${BNB_ROCM_ARCH}) else() if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) endif() From c5a406ad9aed7738a236c9c184a4b1bf2ccd422c Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 19:48:32 +0000 Subject: [PATCH 73/76] Skip cpu based igemmlt int tests on ROCm --- tests/test_functional.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_functional.py b/tests/test_functional.py index 4e82c530a..a9d926b89 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device): + if HIP_ENVIRONMENT and device == "cpu": + pytest.skip("this test is not supported on ROCm yet") + for i in range(k): if dims == 2: A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8) From 9cbb5e12a8987c57188917e5353e46492ef8d1eb Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 21:19:53 +0000 Subject: [PATCH 74/76] Update Documentation --- docs/source/installation.mdx | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index f917f2623..0daa5e279 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -146,13 +146,23 @@ Please follow these steps to install bitsandbytes with device-specific backend s bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release). > [!TIP] -> If you already installed ROCm and PyTorch, skip Docker steps below and please check that the torch version matches your ROCm install. To install torch for a specific ROCm version, please refer to step 3 of wheels install in [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) guide. +> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version. ```bash -# Create a docker container with latest pytorch. 
It comes with ROCm and pytorch preinstalled -docker pull rocm/pytorch:latest -docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/pytorch:latest +# Create a docker container with latest ROCm image, which includes ROCm libraries +docker pull rocm/dev-ubuntu-22.04:6.1.2-complete +docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete +apt-get update && apt-get install -y git && cd home +# Install pytorch compatible with above ROCm version +pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.1/ + +# Install bitsandbytes from PyPI +# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100 +# Please install from source as given below if your configuration doesn't match with these) +pip install bitsandbytes + +# Install bitsandbytes from source # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ From 358062473b3ec475ecf14dd7effc2b02754ff947 Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 21:23:07 +0000 Subject: [PATCH 75/76] Update upstream repo name --- docs/source/installation.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 0daa5e279..a71fe3261 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -164,7 +164,7 @@ pip install bitsandbytes # Install bitsandbytes from source # Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch -git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/ # Install dependencies pip install -r requirements-dev.txt From 3bde1b7bc3b52163d8f35bf654e933879213992e Mon Sep 17 00:00:00 2001 From: Prasanth Nunna Date: Tue, 30 Jul 2024 21:28:39 +0000 Subject: [PATCH 76/76] Update docs --- docs/source/installation.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index a71fe3261..3ed694ac1 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -159,7 +159,7 @@ pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/roc # Install bitsandbytes from PyPI # (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100 -# Please install from source as given below if your configuration doesn't match with these) +# Please install from source if your configuration doesn't match with these) pip install bitsandbytes # Install bitsandbytes from source
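After following the installation steps documented in the patches above, a quick sanity check helps confirm that the freshly installed package is picked up. The following is a hedged, illustrative snippet (not part of the patch series; the tensor size and the NF4 choice are arbitrary) that imports the package and runs a 4-bit quantization round-trip; on ROCm builds the GPU is still addressed as `"cuda"` through PyTorch:

```python
# Post-install smoke test: import bitsandbytes and round-trip an NF4 quantization.
import torch
import bitsandbytes as bnb

print("bitsandbytes version:", bnb.__version__)

x = torch.randn(256, 256, device="cuda", dtype=torch.float16)
q, state = bnb.functional.quantize_4bit(x, quant_type="nf4")
x_dq = bnb.functional.dequantize_4bit(q, state)
print("max abs round-trip error:", (x - x_dq).abs().max().item())
```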