diff --git a/.github/scripts/build-rocm.sh b/.github/scripts/build-rocm.sh new file mode 100644 index 000000000..b508fac69 --- /dev/null +++ b/.github/scripts/build-rocm.sh @@ -0,0 +1,21 @@ +#!/bin/bash +declare build_arch +declare build_os +declare rocm_version + +set -xeuo pipefail +bnb_rocm_arch="gfx90a;gfx942;gfx1100" +if [ "${build_os:0:6}" == ubuntu ]; then + image=rocm/dev-ubuntu-22.04:${rocm_version}-complete + echo "Using image $image" + docker run --rm --platform "linux/$build_arch" -i \ + -w /src -v "$PWD:/src" "$image" sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DCOMPUTE_BACKEND=hip -DBNB_ROCM_ARCH=\"${bnb_rocm_arch}\" . \ + && cmake --build ." +fi + +output_dir="output/${build_os}/${build_arch}" +mkdir -p "${output_dir}" +(shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} "${output_dir}") diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 10272be87..ce4a55aaa 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -13,6 +13,9 @@ jobs: with: commit_sha: ${{ github.sha }} package: bitsandbytes - repo_owner: TimDettmers + repo_owner: bitsandbytes-foundation + # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/ + version_tag_suffix: '' # defaults to '/src' + custom_container: huggingface/transformers-doc-builder secrets: hf_token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index d6455fd11..4679761c6 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,10 +9,13 @@ concurrency: jobs: build: - if: github.repository == 'TimDettmers/bitsandbytes' + if: github.repository == 'bitsandbytes-foundation/bitsandbytes' uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main with: commit_sha: ${{ github.event.pull_request.head.sha }} pr_number: ${{ github.event.number }} package: bitsandbytes - repo_owner: TimDettmers + repo_owner: bitsandbytes-foundation + # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/ + version_tag_suffix: '' # defaults to '/src' + custom_container: huggingface/transformers-doc-builder diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 72e1b099a..91e6d82a6 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -63,12 +63,10 @@ jobs: os: [ubuntu-latest, windows-latest] arch: [x86_64, aarch64] cuda_version: - ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"] + ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.0"] exclude: - os: windows-latest # This probably requires arm64 Windows agents arch: aarch64 - - os: windows-latest # The Jimver/cuda-toolkit is action used for Windows builds is not updated for 12.4 yet. - cuda_version: "12.4.0" - os: ubuntu-latest # Temporary. Takes too long, not ready yet. arch: aarch64 runs-on: ${{ matrix.os }} # One day, we could run them on native agents. 
Azure supports this now but it's planned only for Q3 2023 for hosted agents @@ -79,7 +77,7 @@ jobs: if: startsWith(matrix.os, 'ubuntu') uses: docker/setup-qemu-action@v2 # Windows: We install Cuda on the agent (slow) - - uses: Jimver/cuda-toolkit@v0.2.14 + - uses: Jimver/cuda-toolkit@v0.2.16 if: startsWith(matrix.os, 'windows') id: cuda-toolkit with: @@ -103,10 +101,42 @@ jobs: name: shared_library_cuda_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.cuda_version }} path: output/* retention-days: 7 + build-shared-libs-rocm: + strategy: + matrix: + os: [ubuntu-latest] + arch: [x86_64] + rocm_version: + ["6.1.2"] + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + - uses: actions/checkout@v4 + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + - name: Clean up disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Build C++ + run: bash .github/scripts/build-rocm.sh + env: + build_os: ${{ matrix.os }} + build_arch: ${{ matrix.arch }} + rocm_version: ${{ matrix.rocm_version }} + - name: Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: shared_library_rocm_${{ matrix.os }}_${{ matrix.arch }}_${{ matrix.rocm_version }} + path: output/* + retention-days: 7 build-wheels: needs: - build-shared-libs - build-shared-libs-cuda + - build-shared-libs-rocm strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] diff --git a/CHANGELOG.md b/CHANGELOG.md index c456fa9e5..ed324f09e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +### 0.43.2 + +This release is quite significant as the QLoRA bug fix has big implications for higher `seqlen` and batch sizes. + +For each sequence (i.e. batch size increase of one) we expect memory savings of: +- 405B: 39GB for `seqlen=1024`, and 4888GB for `seqlen=128,000` +- 70B: 10.1GB for `seqlen=1024` and 1258GB for `seqlen=128,000` + +This was because activations are not needed for frozen parameters, yet the memory for them was still erroneously allocated due to the now-fixed bug. + +#### Improvements: + +- docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu) +- Add CUDA 12.5 and update 12.4 builds (#1284) + +#### Bug Fixes + +- 4bit getstate and 8bit deepcopy (#1230 #1231, thanks @BenjaminBossan) +- missing optimizers in `str2optimizer32bit` (#1222, thanks @EtienneDosSantos) +- CUDA 12.5 build issue (#1273, thanks @HennerM) +- fix for min_8bit_size functionality in Optimizer base classes (#1286, thanks @Edenzzzz) +- QLoRA mem bug (#1270, thanks @Ther-nullptr) +- tests for cpu only platforms (#1259, thanks @galqiwi) +- restoration of quant_storage for CPU offloading (#1279) +- optim update error with non-contiguous grads/params (deepspeed) (#1187) ### 0.43.1 #### Improvements: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bedefd51..eac72fe52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,13 @@ endif() if(BUILD_CUDA) + # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+. + # Workaround: use --allow-unsupported-compiler + # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
+ if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940) + string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler") + endif() + enable_language(CUDA) # This will fail if CUDA is not found find_package(CUDAToolkit REQUIRED) @@ -178,7 +185,7 @@ elseif(BUILD_HIP) set(CMAKE_HIP_ARCHITECTURES ${BNB_ROCM_ARCH}) else() if (NOT AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) - set(CMAKE_HIP_ARCHITECTURES "gfx908;gfx90a;gfx940;gfx941;gfx942") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx942;gfx1100") elseif (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS}) endif() @@ -187,12 +194,14 @@ elseif(BUILD_HIP) list(APPEND SRC_FILES ${HIP_FILES}) - string(APPEND BNB_OUTPUT_NAME "_hip") + string(APPEND BNB_OUTPUT_NAME "_rocm") # get hip version execute_process(COMMAND hipconfig --version OUTPUT_VARIABLE HIP_CONFIG_VERSION) string(REGEX MATCH "[0-9]+\\.[0-9]+" HIP_VERSION "${HIP_CONFIG_VERSION}") + string(REPLACE "." "" HIP_VERSION_SHORT "${HIP_VERSION}") + string(APPEND BNB_OUTPUT_NAME "${HIP_VERSION_SHORT}") if(NO_CUBLASLT OR HIP_VERSION VERSION_LESS "6.1") string(APPEND BNB_OUTPUT_NAME "_nohipblaslt") endif() @@ -229,7 +238,6 @@ if(WIN32) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) endif() -# Weird MSVC hacks if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast") endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0fae0ace5..76d7327a8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -9,23 +9,12 @@ We actively welcome your pull requests. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: +5. Make sure your code lints, install the [pre-commit hooks as documented here](https://huggingface.co/docs/bitsandbytes/main/en/contributing#setup-pre-commit-hooks). ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. -Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - ## License By contributing to bitsandbytes, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. diff --git a/README.md b/README.md index 2cf630dcb..7823168ac 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,18 @@ There are ongoing efforts to support further hardware backends, i.e. Intel CPU + **[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)** +## ALPHA TESTERS WANTED: `multi-backend-refactor` AMD GPU + Intel CPU/GPU specific BNB backend implementations + +We're in the process of a complex refactor in order to allow the support of additional hardware backends, other than CUDA, in BNB. The efforts around this are already quite far along and there's plenty of functionality already in place that is in need for users to take a hands-on approach! Mac support will likely soon also see progress. 
However, I recommend waiting 2 weeks until the device abstraction has further consolidated (**breaking changes upcoming**). + +Currently, you still need to compile from source, after checking out the `multi-backend-refactor` branch (instructions WIP, but [the current docs on the compilation from source](https://huggingface.co/docs/bitsandbytes/main/en/installation#compile-from-source) are a good starting point; [feel free to share tips / input in this Github discussion](https://github.com/TimDettmers/bitsandbytes/discussions/1219). We'll soon enable nightly releases to make this much easier for you! + +Please give feedback to us in [this dedicated Github Discussion space](https://github.com/TimDettmers/bitsandbytes/discussions/categories/catch-all-alpha-testing-the-multi-backend-refactor)! + +We're super excited about these recent developments and grateful for any constructive input or support that you can give to help us make this a reality. BNB is a community project and we're excited for your collaboration 🤗 + ## License -The majority of bitsandbytes is licensed under MIT, however small portions of the project are available under separate license terms, as the parts adapted from Pytorch are licensed under the BSD license. +`bitsandbytes` is MIT licensed. We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. diff --git a/_typos.toml b/_typos.toml index a04206b8d..e4e7287fb 100644 --- a/_typos.toml +++ b/_typos.toml @@ -1,5 +1,10 @@ [files] +[default] +extend-ignore-re = [ + "@Ther-nul", # valid Github user +] + [default.extend-identifiers] [type.py.extend-words] diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 760a8eda4..129ac1536 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -16,9 +16,17 @@ ) from .backends import register_backend from .backends.cpu import CPUBackend +from .backends.npu import NPUBackend from .cextension import lib from .nn import modules +# NOTE: this is a temporary flag to allow outside libraries to employ conditional logic while the refactor is still in +# alpha/beta: sth like `if getattr(bitsandbytes, "is_multi_backend_refactor_preview", False): do sth` +# the getattr() call above would default to False and any string evaluates to True. This way we have temporary thing +# that we can remove in Transformers with the next release after the official BNB multi-platform release; then +# eventually making it the new default (e.g. just remove if statement and dedent in Transformers) +is_multi_backend_refactor_preview = "TO BE REMOVED ONCE MERGED TO `main`" # bool evals to True for str + # Always register the CPU backend. register_backend("cpu", CPUBackend()) @@ -49,11 +57,14 @@ register_backend("xpu", XPUBackend()) +# Register Ascend NPU backend, if available. +if hasattr(torch, "npu") and torch.npu.is_available(): + register_backend("npu", NPUBackend()) + # TODO: Other potential backends: # XLA - Google TPU / PJRT runtime # HPU - Habana / Intel Gaudi # IPU - Graphcore -# NPU - Ascend # Note that we may not map 1:1 with a device type, e.g. 
SYCL, XLA # In this case, it will be up to each backend to dispatch as needed @@ -63,4 +74,4 @@ "optim.optimizer.MockArgs": False, } -__version__ = "0.43.2.dev" +__version__ = "0.43.3.dev" diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 71943915b..59e26ad09 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -524,7 +524,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState] ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype if any(ctx.needs_input_grad[:2]): - ctx.tensors = (A, B) + ctx.tensors = (None, B) else: ctx.tensors = (None, None) @@ -537,7 +537,7 @@ def backward(ctx, grad_output): return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad - A, B = ctx.tensors + _, B = ctx.tensors grad_A, grad_B, grad_bias = None, None, None @@ -575,7 +575,8 @@ def matmul_4bit( bias=None, ): assert quant_state is not None - if A.numel() == A.shape[-1] and A.requires_grad == False: + if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False: + # CPU backend does not require A to be a vector if A.shape[-1] % quant_state.blocksize != 0: warn( f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}", diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py index d6a9192e4..5d38171d5 100644 --- a/bitsandbytes/backends/cpu.py +++ b/bitsandbytes/backends/cpu.py @@ -6,9 +6,12 @@ from .base import Backend from .cpu_xpu_common import ( + dequantize_4bit_impl, double_quant_impl, + gemm_4bit_impl, igemmlt_impl, mm_dequant_impl, + quantize_4bit_impl, ) Tensor = torch.Tensor @@ -132,7 +135,11 @@ def quantize_4bit( quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, ) -> Tuple[torch.Tensor, QuantState]: - raise NotImplementedError("Not yet implemented for CPU backend") + if blocksize is None: + blocksize = 64 + assert_on_cpu([A, absmax, out]) + assert quant_storage == torch.uint8, "CPU backend only supports uint8 quant_storage" + return quantize_4bit_impl(A, absmax, out, blocksize, compress_statistics, quant_type) def dequantize_4bit( self, @@ -143,7 +150,10 @@ def dequantize_4bit( blocksize: int = 64, quant_type: Literal["fp4", "nf4"] = "fp4", ) -> torch.Tensor: - raise NotImplementedError("Not yet implemented for CPU backend") + if blocksize is None: + blocksize = 64 + assert_on_cpu([A, absmax, out]) + return dequantize_4bit_impl(A, quant_state, absmax, out, blocksize, quant_type) def gemv_4bit( self, @@ -154,7 +164,11 @@ def gemv_4bit( transposed_B=False, state: QuantState = None, ) -> torch.Tensor: - raise NotImplementedError("Not yet implemented for CPU backend") + assert_on_cpu([A, B, out]) + if state is None: + raise ValueError("state cannot be None. 
gemv_4bit() requires the state from quantize_4bit()") + + return gemm_4bit_impl(A, B, out, transposed_A, transposed_B, state) def dequantize_blockwise( self, diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py index f4e5ed3ec..04755ed2d 100644 --- a/bitsandbytes/backends/cpu_xpu_common.py +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -1,7 +1,14 @@ +import subprocess +from typing import Optional import warnings import torch +from bitsandbytes.functional import ( + QuantState, + get_4bit_type, +) + try: # to support Intel CPU/GPU (XPU) backend import intel_extension_for_pytorch as ipex @@ -13,6 +20,14 @@ ipex_xpu = None +gxx_available = False +try: + subprocess.run(["g++", "--version"]) + gxx_available = True +except BaseException: + warnings.warn("g++ not found, torch.compile disabled for CPU/XPU.") + + Tensor = torch.Tensor @@ -39,8 +54,8 @@ def _ipex_xpu_version_prereq(major, minor): def _maybe_torch_compile(func): - # torch.compile requires pytorch >= 2.0 - if _torch_version_prereq(2, 0): + # torch.compile requires g++ and pytorch >= 2.0 + if gxx_available and _torch_version_prereq(2, 0): options = {} # fx_graph_cache requires pytorch >= 2.2 if _torch_version_prereq(2, 2): @@ -49,7 +64,7 @@ def _maybe_torch_compile(func): return func -# Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382 +@_maybe_torch_compile def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): """ Find absolute max values of each row/column of a tensor, and symmetrically quantize it to int8. @@ -228,3 +243,290 @@ def mm_dequant_impl( out = out + bias.to(compute_dtype) out = out.to(output_dtype) return out + + +NF4_QUANT_TABLE = [ + -1.0 - 1e-2, # 0b0000 + -0.8480964004993439, # 0b0001 + -0.6106329262256622, # 0b0010 + -0.4599952697753906, # 0b0011 + -0.33967943489551544, # 0b0100 + -0.23460740596055984, # 0b0101 + -0.13791173323988914, # 0b0110 + -0.045525018125772476, # 0b0111 + 0.03979014977812767, # 0b1000 + 0.1202552504837513, # 0b1001 + 0.2035212516784668, # 0b1010 + 0.2920137718319893, # 0b1011 + 0.3893125355243683, # 0b1100 + 0.5016634166240692, # 0b1101 + 0.6427869200706482, # 0b1110 + 0.8614784181118011, # 0b1111 +] + + +FP4_QUANT_TABLE = { + 0 - 1e-2: 0, # 0b0000 + 0.00260417: 1, # 0b0001 + 0.0859375: 6, # 0b0110 + 0.20833333: 7, # 0b0111 + 0.29166667: 4, # 0b0100 + 0.4166667: 5, # 0b0101 + 0.583333: 2, # 0b0010 + 0.8333333: 3, # 0b0011 +} + + +@_maybe_torch_compile +def quantize_4bit_impl( + A: Tensor, + absmax: Tensor = None, + out: Tensor = None, + blocksize=64, + compress_statistics=False, + quant_type="nf4", +) -> Tensor: + """ + Quantize tensor A in blocks of 4-bit values. + + Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. + + Parameters + ---------- + A : torch.Tensor + The input tensor. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + The output tensor (8-bit). + blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now + + Returns + ------- + torch.Tensor: + The 8-bit tensor with packed 4-bit values. + tuple(torch.Tensor, torch.Size, torch.dtype, int): + The quantization state to undo the quantization. 
+ """ + if quant_type not in ["nf4", "fp4"]: + raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented for CPU/XPU.") + if quant_type == "fp4": + warnings.warn("fp4 quantization is currently slow on CPU/XPU. Please Use nf4 instead for better performance.") + assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] + n = A.numel() + input_shape = A.shape + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + + if absmax is None: + absmax = torch.zeros((blocks,), device=A.device, dtype=A.dtype) + + if out is None: + out = torch.zeros(((n + 1) // 2), dtype=torch.uint8, device=A.device) + + rem = n % blocksize + has_rem = rem > 0 + + # Scale tensor to [-1, 1] + A_reshaped = A.reshape(n) + A_com = A_reshaped[: n - rem] + A_com_reshaped = A_com.reshape(n // blocksize, blocksize) + absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0] + scaled_A = torch.clamp(A_com_reshaped * (1 / absmax[: blocks - has_rem].view(-1, 1)), -1, 1) + scaled_A = scaled_A.reshape(-1) + if has_rem: + absmax[-1] = torch.abs(A_reshaped[n - rem :]).max() + scaled_A_rem = torch.clamp(A_reshaped[n - rem :] * (1 / absmax[-1]), -1, 1) + scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0) + # map [-1, 1] to nf4/fp4 + out_uint8 = torch.empty(scaled_A.shape, dtype=torch.uint8) + if quant_type == "nf4": + for i in range(len(NF4_QUANT_TABLE)): + out_uint8[scaled_A > NF4_QUANT_TABLE[i]] = i + elif quant_type == "fp4": + sign = scaled_A < 0 + abs_scaled_A = torch.abs(scaled_A) + for key, val in FP4_QUANT_TABLE.items(): + out_uint8[abs_scaled_A > key] = val + out_uint8 += sign.to(torch.uint8) * 8 + if out_uint8.size(-1) % 2: + out_uint8 = torch.nn.functional.pad(out_uint8, (0, 1), value=0) + out[:] = out_uint8[1::2].bitwise_left_shift(4).bitwise_or_(out_uint8[::2]) + + code = get_4bit_type(quant_type, device=A.device) + + if compress_statistics: + raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + else: + state = QuantState( + absmax=absmax, + shape=input_shape, + dtype=A.dtype, + blocksize=blocksize, + code=code, + quant_type=quant_type, + ) + + if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and input_shape[1] % blocksize == 0 and quant_type == "nf4": + # lowp_mode: lowest precision for computation + lowp_mode = ipex_cpu.quantization.WoqLowpMode.BF16 + state.op_context = torch.ops.ipex_prepack.weight_only_qlinear_prepack( + out.reshape([input_shape[0], input_shape[1] // 2]), + ipex_cpu.quantization.WoqWeightDtype.NF4, + input_shape, # weight shape + absmax.view(input_shape[0], input_shape[1] // blocksize), # scales + None, # zero_points + None, # bias + None, # g_idx + None, # batch_size + blocksize, + int(lowp_mode), + -1, # act_quant_mode. -1 means don't quant activation + ) + state.absmax = torch.Tensor() + return torch.Tensor(), state + + return out, state + + +@_maybe_torch_compile +def dequantize_4bit_impl( + A: Tensor, + quant_state=None, + absmax: Tensor = None, + out: Tensor = None, + blocksize: int = 64, + quant_type="nf4", +) -> Tensor: + """ + Dequantizes FP4 blockwise quantized values. + + Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize. + + Parameters + ---------- + A : torch.Tensor + The input 8-bit tensor (packed 4-bit values). + quant_state : QuantState + object with quantisation stats, incl. absmax values, original tensor shape and original dtype. + absmax : torch.Tensor + The absmax values. + out : torch.Tensor + Dequantized output tensor. 
+ blocksize : int + The blocksize used in quantization. + quant_type : str + The 4-bit quantization data type {fp4, nf4}, only nf4 is supported now + + + Returns + ------- + torch.Tensor: + Dequantized tensor. + """ + + if quant_state is None: + assert absmax is not None and out is not None + + quant_state = QuantState( + absmax=absmax, + shape=out.shape, + dtype=out.dtype, + blocksize=blocksize, + quant_type=quant_type, + ) + + else: + absmax = quant_state.absmax + + if quant_type not in ["nf4", "fp4"]: + raise NotImplementedError( + f"4-bit quantization data type {quant_state.quant_type} is not implemented for CPU/XPU." + ) + + if quant_state.nested: + raise NotImplementedError("bnb_4bit_use_double_quant is not supported yet for CPU/XPU") + + if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(quant_state, "op_context"): + assert quant_state.op_context is not None + A = quant_state.op_context.to_public(quant_state.op_context.get_weight()) + A = A.reshape(-1) + absmax = quant_state.op_context.get_scales().reshape(-1) + + if out is None: + out = torch.empty(quant_state.shape, dtype=quant_state.dtype, device=A.device) + + n = out.numel() + # Map nf4 to [-1, 1] + out_uint8 = torch.empty(A.size(0) * 2, dtype=torch.uint8, device=A.device) + out_uint8[::2] = A.bitwise_and(0xF) + out_uint8[1::2] = A.bitwise_right_shift(4) + out_dq = torch.empty(out_uint8.shape).to(quant_state.dtype) + for i in range(len(quant_state.code)): + out_dq[out_uint8 == i] = quant_state.code[i] + + # Apply scales + if out_dq.numel() != n: + assert out_dq.numel() == n + 1 + out_dq = torch.narrow(out_dq, 0, 0, n) + blocks = n // blocksize + blocks += 1 if n % blocksize > 0 else 0 + rem = n % blocksize + has_rem = rem > 0 + out_reshaped = out.reshape(-1) + out_reshaped[: n - rem] = (out_dq[: n - rem].view(-1, blocksize) * absmax[: blocks - has_rem].view(-1, 1)).reshape( + -1 + ) + if has_rem: + out_reshaped[n - rem :] = out_dq[n - rem :] * absmax[-1] + + # take transpose here because weight is transposed (again) for computation + return out.t() + + +# Do not need torch.compile here as we are calling torch/ipex kernel +def gemm_4bit_impl( + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, +) -> torch.Tensor: + """ + Matrix-matrix multiplication with 4-bit quantization. + + Parameters + ---------- + A : torch.Tensor + The first input tensor. Usually the activation tensor. + B : torch.Tensor + The second input tensor. Usually the weight tensor. + out : torch.Tensor + The output tensor. + transposed_A : bool + Whether A is transposed + transposed_B : bool + Whether B is transposed + state : QuantState + Contains quantization info, such as blocksize and dtype + + Returns + ------- + torch.Tensor: + GEMM output tensor. 
+ """ + if ipex_cpu and _ipex_cpu_version_prereq(2, 3) and hasattr(state, "op_context"): + assert state.op_context is not None + output = torch.ops.torch_ipex.ipex_woq_linear(A, state.op_context.get_data_handle()) + else: + dqB = dequantize_4bit_impl(B, state, blocksize=state.blocksize) + output = torch.matmul(A, dqB.to(A.dtype)) + if out is not None: + out.copy_(output) + else: + out = output + return out diff --git a/bitsandbytes/backends/npu.py b/bitsandbytes/backends/npu.py new file mode 100644 index 000000000..1b3cb57d6 --- /dev/null +++ b/bitsandbytes/backends/npu.py @@ -0,0 +1,170 @@ +from typing import Literal, Optional, Tuple, Union + +import torch + +from bitsandbytes.utils import QuantState + +from .base import Backend + +try: + # to support Ascend NPU backend + import torch_npu # noqa: F401 +except ImportError: + pass + + +class NPUBackend(Backend): + def double_quant( + self, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, + threshold=0.0, + ): + raise NotImplementedError + + def transform( + self, + A: torch.Tensor, + to_order: str, + from_order="row", + out: Optional[torch.Tensor] = None, + transpose=False, + state: Optional[Tuple[torch.Size, str]] = None, + ld=None, + ): + raise NotImplementedError + + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]: + raise NotImplementedError + + def mm_dequant( + self, + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + raise NotImplementedError + + def extract_outliers( + self, + A: torch.Tensor, + SA: Tuple[torch.Size, str], + idx: torch.Tensor, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_4bit( + self, + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def dequantize_4bit( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, + quant_type: Literal["fp4", "nf4"] = "fp4", + ) -> torch.Tensor: + raise NotImplementedError + + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ) -> torch.Tensor: + raise NotImplementedError + + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_blockwise( + self, + A: torch.Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, 
+ blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError + + def optimizer_update_32bit( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py index 03d2cbd61..a096d0d51 100644 --- a/bitsandbytes/cextension.py +++ b/bitsandbytes/cextension.py @@ -38,9 +38,9 @@ def get_cuda_bnb_library_path(cuda_specs: CUDASpecs) -> Path: """ if torch.version.hip: if BNB_HIP_VERSION < 601: - return PACKAGE_DIR / f"libbitsandbytes_hip_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}_nohipblaslt{DYNAMIC_LIBRARY_SUFFIX}" else: - return PACKAGE_DIR / f"libbitsandbytes_hip{DYNAMIC_LIBRARY_SUFFIX}" + return PACKAGE_DIR / f"libbitsandbytes_rocm{BNB_HIP_VERSION_SHORT}{DYNAMIC_LIBRARY_SUFFIX}" library_name = f"libbitsandbytes_cuda{cuda_specs.cuda_version_string}" if not cuda_specs.has_cublaslt: # if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt @@ -119,8 +119,10 @@ def get_native_library() -> BNBNativeLibrary: if torch.version.hip: hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2]) HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor + BNB_HIP_VERSION_SHORT = str(hip_major) + str(hip_minor) else: HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0 + BNB_HIP_VERSION_SHORT = "" lib = get_native_library() except Exception as e: lib = None diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 2041589b3..6cf64df28 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -27,6 +27,35 @@ def prod(iterable): if lib and lib.compiled_with_cuda: """C FUNCTIONS FOR OPTIMIZERS""" + str2optimizer32bit = { + "adam": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + lib.cadam32bit_grad_bf16, + ), + "momentum": ( + lib.cmomentum32bit_grad_32, + lib.cmomentum32bit_grad_16, + ), + "rmsprop": ( + lib.crmsprop32bit_grad_32, + lib.crmsprop32bit_grad_16, + ), + "lion": ( + lib.clion32bit_grad_fp32, + lib.clion32bit_grad_fp16, + lib.clion32bit_grad_bf16, + ), + "adagrad": ( + lib.cadagrad32bit_grad_32, + lib.cadagrad32bit_grad_16, + ), + "lamb": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + ), + } + str2optimizer8bit = { "adam": ( lib.cadam_static_8bit_grad_32, diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 79b31f51f..c92b25e2c 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -240,7 +240,7 @@ def __new__( return self def __getstate__(self): - state = self.__dict__ + state = self.__dict__.copy() state["data"] = self.data state["requires_grad"] = self.requires_grad return state @@ -286,10 +286,13 @@ def 
from_prequantized( self.compress_statistics = self.quant_state.nested self.quant_type = self.quant_state.quant_type self.bnb_quantized = True + + self.quant_storage = data.dtype + return self def _quantize(self, device): - w = self.data.contiguous().cuda(device) + w = self.data.contiguous().to(device) w_4bit, quant_state = bnb.functional.quantize_4bit( w, blocksize=self.blocksize, @@ -307,6 +310,9 @@ def _quantize(self, device): def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False): return self.to(device="cuda" if device is None else device, non_blocking=non_blocking) + def cpu(self, non_blocking: bool = False): + return self.to(device="cpu", non_blocking=non_blocking) + @overload def to( self: T, @@ -324,7 +330,7 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ... def to(self, *args, **kwargs): device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) - if device is not None and device.type == "cuda" and not self.bnb_quantized: + if device is not None and device.type in ["cuda", "cpu"] and not self.bnb_quantized: return self._quantize(device) else: if self.quant_state is not None: @@ -337,6 +343,7 @@ def to(self, *args, **kwargs): blocksize=self.blocksize, compress_statistics=self.compress_statistics, quant_type=self.quant_type, + quant_storage=self.quant_storage, ) return new_param @@ -454,7 +461,7 @@ def forward(self, x: torch.Tensor): # since we registered the module, we can recover the state here assert self.weight.shape[1] == 1 if not isinstance(self.weight, Params4bit): - self.weight = Params4bit(self.weight, quant_storage=self.quant_storage) + self.weight = Params4bit(self.weight, quant_storage=self.quant_storage, bnb_quantized=True) self.weight.quant_state = self.quant_state else: print( @@ -564,13 +571,12 @@ def __new__( CB=None, SCB=None, ): - cls.has_fp16_weights = has_fp16_weights - cls.CB = None - cls.SCB = None if data is None: data = torch.empty(0) obj = torch.Tensor._make_subclass(cls, data, requires_grad) - obj.CB, obj.SCB = cls.CB, cls.SCB + obj.CB = CB + obj.SCB = SCB + obj.has_fp16_weights = has_fp16_weights return obj def cuda(self, device): @@ -589,6 +595,18 @@ def cuda(self, device): return self + def __deepcopy__(self, memo): + # adjust this if new arguments are added to the constructor + new_instance = type(self).__new__( + type(self), + data=copy.deepcopy(self.data, memo), + requires_grad=self.requires_grad, + has_fp16_weights=self.has_fp16_weights, + CB=copy.deepcopy(self.CB, memo), + SCB=copy.deepcopy(self.SCB, memo), + ) + return new_instance + def cpu(self): # we store the 8-bit rows-major weight B = self.data.contiguous().bfloat16().cpu() diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index f1e60e5e7..e9c857d49 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -437,7 +437,7 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): + if dtype == torch.float32: state["state1"] = self.get_state_buffer(p, dtype=torch.float32) state["state2"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: @@ -474,6 +474,10 @@ def init_state(self, group, p, gindex, pindex): @torch.no_grad() def update_step(self, group, p, gindex, pindex): + # avoid update error from non-contiguous memory layout + p.data = p.data.contiguous() + p.grad = p.grad.contiguous() + state = self.state[p] grad = p.grad 
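The two `contiguous()` calls added to `update_step` above address the DeepSpeed-related report in #1187: parameters and gradients can be handed to the optimizer as non-contiguous views, which the fused 8-bit/32-bit update kernels cannot safely index. A minimal sketch of the failure mode and the normalization applied by the fix (plain PyTorch only; the tensor shapes are illustrative):

```python
import torch

# A transposed view shares storage with its source but is not laid out
# contiguously, so kernels that walk the raw storage cannot assume
# row-major strides.
w = torch.randn(4, 8)
p = torch.nn.Parameter(w.t())       # non-contiguous view (swapped strides)
p.grad = torch.randn(4, 8).t()      # gradient with the same layout

print(p.data.is_contiguous(), p.grad.is_contiguous())  # False False

# The normalization now done at the top of update_step():
p.data = p.data.contiguous()
p.grad = p.grad.contiguous()

print(p.data.is_contiguous(), p.grad.is_contiguous())  # True True
```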
@@ -656,7 +660,7 @@ def init_state(self, group, p, gindex, pindex): state = self.state[p] state["step"] = 0 - if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096): + if dtype == torch.float32: state["state1"] = self.get_state_buffer(p, dtype=torch.float32) elif dtype == torch.uint8: if state["step"] == 0: @@ -685,6 +689,10 @@ def init_state(self, group, p, gindex, pindex): @torch.no_grad() def update_step(self, group, p, gindex, pindex): + # avoid update error from non-contiguous memory layout + p.data = p.data.contiguous() + p.grad = p.grad.contiguous() + state = self.state[p] grad = p.grad diff --git a/csrc/kernels.cu b/csrc/kernels.cu index f4673359b..e4d459961 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -12,8 +12,6 @@ #include #include #include -#include -#include #include diff --git a/csrc/kernels.hip b/csrc/kernels.hip index ca77dceda..d8d7cdba5 100644 --- a/csrc/kernels.hip +++ b/csrc/kernels.hip @@ -10,8 +10,6 @@ #include #include -#include -#include //#include diff --git a/csrc/ops.cuh b/csrc/ops.cuh index da9df6af0..8b9a4f449 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -19,10 +19,6 @@ #include #include -#include -#include - - #define CUDA_CHECK_RETURN(value) { \ cudaError_t _m_cudaStat = value; \ diff --git a/csrc/ops_hip.cuh b/csrc/ops_hip.cuh index 1b9c13063..e57cbb3b5 100644 --- a/csrc/ops_hip.cuh +++ b/csrc/ops_hip.cuh @@ -21,12 +21,6 @@ #include #include -/* -#include -#include -*/ - - #define CUDA_CHECK_RETURN(value) { \ hipError_t _m_cudaStat = value; \ if (_m_cudaStat != hipSuccess) { \ diff --git a/docs/source/fsdp_qlora.md b/docs/source/fsdp_qlora.md index 47922cfcc..11e169ffb 100644 --- a/docs/source/fsdp_qlora.md +++ b/docs/source/fsdp_qlora.md @@ -9,25 +9,40 @@ This guide provides a brief guide on how bitsandbytes supports storing quantized ## Quantized data storage -FSDP only supports sharding float data types which can be problematic because quantized weights are typically stored as integer data types (uint8). bitsandbytes doesn't have this problem because it uses `StoreChar` to read and write quantized weights regardless of the data type storage. This makes it simple to add a `quant_storage` parameter to the [`~nn.Linear4bit`] and [`~nn.Params4bit`] classes and set it to `torch.uint8` to maintain backward compatibility with the codebase. +FSDP only supports sharding float data types which can be problematic because quantized weights are typically stored as integer data types (uint8). bitsandbytes doesn't have this problem because it uses `StoreChar` to read and write quantized weights regardless of the data type storage. This makes it simple to add a `quant_storage` parameter to the [`~nn.Linear4bit`] and [`~nn.Params4bit`] classes and set it to `torch.uint8` to maintain backward compatibility with the codebase. With the `quant_storage` parameter, you can select any of the FSDP supported data types to shard [`~nn.Linear4bit`] with such as bfloat16, float16 or float32. + +You'll typically access and configure this option from [`transformers.BitsAndBytesConfig`] by setting the `bnb_4bit_quant_storage` parameter. It is very **important** the `quant_storage` data type matches the data types used throughout the model because FSDP can only wrap layers and modules that have the *same floating data type*. Making sure the data types are aligned will ensure the model is correctly sharded. 
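Because FSDP can only wrap layers whose floating-point parameters share one dtype, a quick sanity check after loading is to enumerate the distinct parameter dtypes. The helper below is a hypothetical sketch, not part of bitsandbytes or Transformers; `model` stands for any module loaded with an aligned `bnb_4bit_quant_storage`:

```python
from collections import Counter

import torch


def summarize_param_dtypes(model: torch.nn.Module) -> Counter:
    """Count parameters per dtype; with quant_storage aligned to the model's
    compute dtype (e.g. torch.bfloat16), everything should land in one bucket."""
    return Counter(p.dtype for p in model.parameters())


# Plain-module example; a correctly configured 4-bit model would show a
# single floating dtype such as torch.bfloat16.
model = torch.nn.Linear(16, 16, dtype=torch.bfloat16)
print(summarize_param_dtypes(model))  # Counter({torch.bfloat16: 2})
```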
+ +> [!TIP] +> The `compute_dtype` is the data type used for computation inside the CUDA kernel, where the 4-bit quantized weights are unpacked from the data type in `quant_storage` and dequantized to `compute_dtype`. We recommend using torch.bfloat16 (if available on your hardware) for better numerical stability. ```py -import torch -import bitsandbytes as bnb - -model = bnb.nn.Linear4bit( - input_features, - output_features, - quant_type="fp4", - quant_storage=torch.uint8, +from transformers import BitsAndBytesConfig, AutoModelForCausalLM + +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_quant_storage=torch.bfloat16, +) + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-70b", + quantization_config=bnb_config, + torch_dtype=torch.bfloat16, ) ``` -With the `quant_storage` parameter, you can select any of the FSDP supported data types to shard [`~nn.Linear4bit`] with such as bfloat16, float16 or float32. +Check out this [section](https://hf.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) of the PEFT documentation for the config file and training code to run FSDP-QLoRA training. ## Training -bitsandbytes is deeply integrated with the Hugging Face ecosystem, making it easy to use with libraries like [Transformers](https://hf/co/docs/transformers), [PEFT](https://hf/co/docs/peft), and [TRL](https://hf/co/docs/trl). +> [!TIP] +> FSDP is a distributed training framework that needs to be launched as a distributed training job with a library like [Accelerate](https://hf.co/docs/accelerate/index) or [torchrun](https://pytorch.org/docs/stable/elastic/run.html). The launch command provided in this section uses Accelerate to launch the training script. + +bitsandbytes is deeply integrated with the Hugging Face ecosystem, making it easy to use with libraries like [Transformers](https://hf.co/docs/transformers), [PEFT](https://hf.co/docs/peft), and [TRL](https://hf.co/docs/trl). + +PEFT provides a configuration file ([fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml)), launch command ([run_peft_qlora_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_fsdp.sh)), and training script ([train.py](https://github.com/huggingface/peft/blob/main/examples/sft/train.py)) for running FSDP-QLoRA. To learn more, check out the [Use PEFT QLoRA and FSDP for finetuning large models on multiple GPUs](https://huggingface.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) documentation. This section briefly covers the steps to run FSDP-QLoRA training. Before you begin, make sure you have the latest libraries installed. @@ -35,9 +50,6 @@ Before you begin, make sure you have the latest libraries installed. pip install -U bitsandbytes accelerate transformers peft trl ``` -> [!TIP] -> PEFT provides a configuration file ([fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml)), launch command ([run_peft_qlora_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_fsdp.sh)), and training script ([train.py](https://github.com/huggingface/peft/blob/main/examples/sft/train.py)) for FSDP-QLoRA. 
To learn more, check out the [Use PEFT QLoRA and FSDP for finetuning large models on multiple GPUs](https://huggingface.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) documentation. - The important change that enables FSDP-QLoRA training is the `bnb_4bit_quant_storage` parameter in the [`~transformers.BitsAndBytesConfig`] class. This allows you to set the storage data type of the quantized weights to a float data type. ```py diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index caf22488f..3ed694ac1 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,8 +1,10 @@ # Installation -bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. +## CUDA -The latest version of bitsandbytes (v0.43.0) builds on: +bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.5**. However, there's a multi-backend effort under way which is currently in alpha release, check [the respective section below in case you're interested to help us with early feedback](#multi-backend). + +The latest version of bitsandbytes builds on: | OS | CUDA | Compiler | |---|---|---| @@ -29,7 +31,7 @@ To install from PyPI. pip install bitsandbytes ``` -## Compile from source +### Compile from source[[compile]] For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations. @@ -59,7 +61,7 @@ git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ pip install -r requirements-dev.txt cmake -DCOMPUTE_BACKEND=cuda -S . make -pip install . +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) ``` > [!TIP] @@ -83,7 +85,7 @@ git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ pip install -r requirements-dev.txt cmake -DCOMPUTE_BACKEND=cuda -S . cmake --build . --config Release -python -m build --wheel +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) ``` Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. @@ -91,7 +93,7 @@ Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com -## PyTorch CUDA versions +### PyTorch CUDA versions Some bitsandbytes features may need a newer CUDA version than the one currently supported by PyTorch binaries from Conda and pip. In this case, you should follow these instructions to load a precompiled bitsandbytes binary. 
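Before overriding anything, it can help to confirm which CUDA runtime your PyTorch build reports and whether a `BNB_CUDA_VERSION` override is already in effect. This is an illustrative sketch only, not an official bitsandbytes helper; the compact version string (e.g. `117`, `125`) follows the convention used in these docs:

```python
import os

import torch

# CUDA runtime version PyTorch was built against, e.g. "12.1" (None on CPU-only builds).
torch_cuda = torch.version.cuda
print(f"PyTorch built with CUDA {torch_cuda}")

override = os.environ.get("BNB_CUDA_VERSION")
if override:
    # e.g. "117" -> expects a local CUDA 11.7 install on LD_LIBRARY_PATH
    print(f"BNB_CUDA_VERSION={override} overrides the PyTorch-detected version.")
else:
    print("No override set; bitsandbytes loads the binary matching the PyTorch CUDA version.")
```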
@@ -105,7 +107,7 @@ Then locally install the CUDA version you need with this script from bitsandbyte ```bash wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/install_cuda.sh # Syntax cuda_install CUDA_VERSION INSTALL_PREFIX EXPORT_TO_BASH -# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124} +# CUDA_VERSION in {110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125} # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc @@ -127,7 +129,78 @@ For example, to use a local install path: ```bash export BNB_CUDA_VERSION=117 -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/tim/local/cuda-11.7 +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/YOUR_USERNAME/local/cuda-11.7 ``` 3. Now when you launch bitsandbytes with these environment variables, the PyTorch CUDA version is overridden by the new CUDA version (in this example, version 11.7) and a different bitsandbytes library is loaded. + +## Multi-backend preview release compilation[[multi-backend]] + +Please follow these steps to install bitsandbytes with device-specific backend support other than CUDA: + + + + +### AMD GPU + +bitsandbytes is fully supported from ROCm 6.1 onwards (currently in alpha release). + +> [!TIP] +> If you would like to install ROCm and PyTorch on bare metal, skip Docker steps and refer to our official guides at [ROCm installation overview](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/install-overview.html#rocm-install-overview) and [Installing PyTorch for ROCm](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/3rd-party/pytorch-install.html#using-wheels-package) (Step 3 of wheels build for quick installation). Please make sure to get PyTorch wheel for the installed ROCm version. + +```bash +# Create a docker container with latest ROCm image, which includes ROCm libraries +docker pull rocm/dev-ubuntu-22.04:6.1.2-complete +docker run -it --device=/dev/kfd --device=/dev/dri --group-add video rocm/dev-ubuntu-22.04:6.1.2-complete +apt-get update && apt-get install -y git && cd home + +# Install pytorch compatible with above ROCm version +pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/rocm6.1/ + +# Install bitsandbytes from PyPI +# (This is supported on Ubuntu 22.04, Python 3.10, ROCm 6.1.0/6.1.1/6.1.2 and gpu arch - gfx90a, gfx942, gfx1100 +# Please install from source if your configuration doesn't match with these) +pip install bitsandbytes + +# Install bitsandbytes from source +# Clone bitsandbytes repo, ROCm backend is currently enabled on multi-backend-refactor branch +git clone --depth 1 -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git && cd bitsandbytes/ + +# Install dependencies +pip install -r requirements-dev.txt + +# Compile & install +apt-get install -y build-essential cmake # install build tools dependencies, unless present +cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch +make +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) +``` + + + + +### Intel CPU + +> [!TIP] +> Intel CPU backend only supports building from source; for now, please follow the instructions below. + +Similar to the CUDA case, you can compile bitsandbytes from source for Linux and Windows systems. + +The below commands are for Linux. 
For installing on Windows, please adapt the below commands according to the same pattern as described [the section above on compiling from source under the Windows tab](#compile). + +``` +git clone --depth 1 -b multi-backend-refactor https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ +pip install intel_extension_for_pytorch +pip install -r requirements-dev.txt +cmake -DCOMPUTE_BACKEND=cpu -S . +make +pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out) +``` + + + + +WIP + + + diff --git a/docs/source/rocm_installation.mdx b/docs/source/rocm_installation.mdx deleted file mode 100644 index 5d4381e7d..000000000 --- a/docs/source/rocm_installation.mdx +++ /dev/null @@ -1,46 +0,0 @@ -# ROCm Installation - -Please follow these steps to install bitsandbytes on ROCm. - - - - -For latest installation: - -```bash -git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ -pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=hip -S . #Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch -make -pip install . -``` - - - - -For ROCm specific versions: - -Install Dependencies: - -```bash -# hipblaslt installation needed only for rocm<6.0 -apt install hipblaslt -pip install --upgrade pip -pip install einops lion_pytorch accelerate -pip install git+https://github.com/ROCm/transformers.git -``` - -Install bitsandbytes from [ROCm](https://github.com/ROCm/bitsandbytes) repo: - -```bash -git clone --recurse https://github.com/ROCm/bitsandbytes -cd bitsandbytes -# Checkout branch as needed -# for rocm 5.7 - rocm5.7_internal_testing -# for rocm 6.x - rocm6.2_internal_testing -git checkout -make hip -python setup.py install -``` - - diff --git a/install_cuda.py b/install_cuda.py index cf7c8ee71..8267c5e2b 100644 --- a/install_cuda.py +++ b/install_cuda.py @@ -17,7 +17,8 @@ "121": "https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run", "122": "https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run", "123": "https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run", - "124": "https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run", + "124": "https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run", + "125": "https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run", } diff --git a/install_cuda.sh b/install_cuda.sh index 2e7fe8ed2..0aa9531fc 100644 --- a/install_cuda.sh +++ b/install_cuda.sh @@ -11,7 +11,8 @@ URL120=https://developer.download.nvidia.com/compute/cuda/12.0.1/local_installer URL121=https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run URL122=https://developer.download.nvidia.com/compute/cuda/12.2.2/local_installers/cuda_12.2.2_535.104.05_linux.run URL123=https://developer.download.nvidia.com/compute/cuda/12.3.2/local_installers/cuda_12.3.2_545.23.08_linux.run -URL124=https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run +URL124=https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run +URL125=https://developer.download.nvidia.com/compute/cuda/12.5.0/local_installers/cuda_12.5.0_555.42.02_linux.run CUDA_VERSION=$1 BASE_PATH=$2 @@ -60,11 +61,14 @@ if 
[[ -n "$CUDA_VERSION" ]]; then elif [[ "$CUDA_VERSION" -eq "124" ]]; then URL=$URL124 FOLDER=cuda-12.4 + elif [[ "$CUDA_VERSION" -eq "125" ]]; then + URL=$URL125 + FOLDER=cuda-12.5 else - echo "argument error: No cuda version passed as input. Choose among versions 110 to 124" + echo "argument error: No cuda version passed as input. Choose among versions 110 to 125" fi else - echo "argument error: No cuda version passed as input. Choose among versions 92 to 123" + echo "argument error: No cuda version passed as input. Choose among versions 110 to 125" fi FILE=$(basename $URL) diff --git a/requirements-ci.txt b/requirements-ci.txt index 24e2db324..182e1023e 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,6 +1,6 @@ # Requirements used for GitHub actions -pytest==8.2.0 +pytest==8.3.1 einops==0.8.0 -lion-pytorch==0.1.4 +lion-pytorch==0.2.2 scipy==1.10.1; python_version < "3.9" -scipy==1.13.0; python_version >= "3.9" +scipy==1.14.0; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index 0334896be..41211880c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,9 @@ # Requirements used for local development setuptools>=63 -pytest~=8.2.0 +pytest~=8.3.1 einops~=0.8.0 wheel~=0.43.0 -lion-pytorch~=0.1.4 -scipy~=1.13.0 +lion-pytorch~=0.2.2 +scipy~=1.14.0 pandas~=2.2.2 -matplotlib~=3.8.4 +matplotlib~=3.9.1 diff --git a/setup.py b/setup.py index f8d6a92a1..18de0fe5b 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( name="bitsandbytes", - version="0.43.2.dev", + version="0.43.3.dev", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", diff --git a/tests/test_functional.py b/tests/test_functional.py index 8ddee9f9a..a9d926b89 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -584,6 +584,9 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) @pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device")) def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device): + if HIP_ENVIRONMENT and device == "cpu": + pytest.skip("this test is not supported on ROCm yet") + for i in range(k): if dims == 2: A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8) @@ -2017,7 +2020,8 @@ def test_bench_dequantization(): @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) @pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096]) -def test_4bit_quant(dtype, quant_type, blocksize): +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +def test_4bit_quant(dtype, quant_type, blocksize, device): vals = list(product([0, 1], repeat=4)) code = {} @@ -2041,9 +2045,11 @@ def test_4bit_quant(dtype, quant_type, blocksize): result = sign * exp * frac code[idx] = result - A1 = torch.randn(1024, 1024, device="cuda", dtype=dtype) + A1 = torch.randn(1024, 1024, device=device, dtype=dtype) qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type) A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type) + if device == "cpu": + A2 = A2.t() err = (A1 - A2).abs().float() relerr = (err / (A1.abs().float() + 1e-8)).mean() @@ -2297,6 +2303,49 @@ def test_gemv_4bit(dtype, storage_type, quant_storage, double_quant, kind): assert maxratio < 1.02 and maxratio 
> 0.98 +@pytest.mark.parametrize("kind", ["fc1", "fc2", "attn", "attn_packed"]) +@pytest.mark.parametrize("quant_type", ["nf4", "fp4"]) +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32], ids=describe_dtype) +def test_gemv_4bit_cpu(dtype, quant_type, kind): + """ + Test 4bit GEMV for CPU. It is simplified a lot from the cuda version, since + the CPU backend does not support double_quant or quant_storage other than uint8. + Also, the CPU backend has different numeric accuracy from that of CUDA + """ + for dim in [128, 256, 512, 1024]: + for i in range(10): + if kind == "fc1": + A = torch.randn(1, dim, dtype=dtype, device="cpu") + B = torch.randn(dim * 4, dim, dtype=dtype, device="cpu") / math.sqrt(dim) + elif kind == "fc2": + A = torch.randn(1, 4 * dim, dtype=dtype, device="cpu") + B = torch.randn(dim, 4 * dim, dtype=dtype, device="cpu") / math.sqrt(dim) + elif kind == "attn": + A = torch.randn(1, dim, dtype=dtype, device="cpu") + B = torch.randn(dim, dim, dtype=dtype, device="cpu") / math.sqrt(dim) + elif kind == "attn_packed": + A = torch.randn(1, dim, dtype=dtype, device="cpu") + B = torch.randn(dim * 3, dim, dtype=dtype, device="cpu") / math.sqrt(dim) + + qB, state = F.quantize_4bit( + B, + quant_type=quant_type, + compress_statistics=False, + quant_storage=torch.uint8, + ) + dqB = F.dequantize_4bit(qB, state) + C3 = torch.matmul(A, dqB) + C2 = F.gemv_4bit(A, qB.t(), state=state) + A.requires_grad = True + C1 = bnb.matmul_4bit(A, qB.t(), state) + + c = int(C1.numel() * 0.0014 * (dim / 256)) + 1 + rtol = 1e-3 if dtype != torch.bfloat16 else 1e-2 + atol = 1e-2 if dtype != torch.bfloat16 else 5e-2 + assert_all_approx_close(C1, C2, rtol, atol, count=c) + assert_all_approx_close(C3, C2, rtol, atol, count=c) + + @pytest.mark.skip("Row scale has some bugs for ampere") def test_managed(): n = 32 * 10 diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index bbbd05335..2f094be27 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -186,19 +186,30 @@ def test_copy_param(): def test_deepcopy_param(): tensor = torch.tensor([1.0, 2.0, 3.0, 4.0]) param = bnb.nn.Params4bit(data=tensor, requires_grad=False).cuda(0) + dict_keys_before = set(param.__dict__.keys()) copy_param = copy.deepcopy(param) + dict_keys_after = set(param.__dict__.keys()) + dict_keys_copy = set(copy_param.__dict__.keys()) + assert param.quant_state is not copy_param.quant_state assert param.data.data_ptr() != copy_param.data.data_ptr() + # there was a bug where deepcopy would modify the original object + assert dict_keys_before == dict_keys_after + assert dict_keys_before == dict_keys_copy + def test_params4bit_real_serialization(): original_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0], dtype=torch.float32) original_param = bnb.nn.Params4bit(data=original_tensor, quant_type="fp4") + dict_keys_before = set(original_param.__dict__.keys()) original_param.cuda(0) # move to CUDA to trigger quantization serialized_param = pickle.dumps(original_param) deserialized_param = pickle.loads(serialized_param) + dict_keys_after = set(original_param.__dict__.keys()) + dict_keys_deserialized = set(deserialized_param.__dict__.keys()) assert torch.equal(original_param.data, deserialized_param.data) assert original_param.requires_grad == deserialized_param.requires_grad == False @@ -206,3 +217,7 @@ def test_params4bit_real_serialization(): assert original_param.blocksize == deserialized_param.blocksize assert original_param.compress_statistics == deserialized_param.compress_statistics assert 
original_param.quant_state == deserialized_param.quant_state + + # there was a bug where deepcopy would modify the original object + assert dict_keys_before == dict_keys_after + assert dict_keys_before == dict_keys_deserialized diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 2a4bd02e2..c4409cc2e 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -1,5 +1,7 @@ from contextlib import nullcontext +import copy import os +import pickle from tempfile import TemporaryDirectory import pytest @@ -181,3 +183,59 @@ def test_linear_serialization( assert torch.allclose(x_first.grad, x_second.grad, atol=1e-5) assert torch.allclose(fx_first, fx_third, atol=1e-5) assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) + + +@pytest.fixture +def linear8bit(requires_cuda): + linear = torch.nn.Linear(32, 96) + linear_custom = Linear8bitLt( + linear.in_features, + linear.out_features, + linear.bias is not None, + has_fp16_weights=False, + threshold=6.0, + ) + linear_custom.weight = bnb.nn.Int8Params( + linear.weight.data.clone(), + requires_grad=False, + has_fp16_weights=False, + ) + linear_custom.bias = linear.bias + linear_custom = linear_custom.cuda() + return linear_custom + + +def test_linear8bit_copy_param(linear8bit): + shallow_copy = copy.copy(linear8bit) + assert linear8bit.weight is shallow_copy.weight + assert linear8bit.bias is shallow_copy.bias + assert linear8bit.weight.data.data_ptr() == shallow_copy.weight.data.data_ptr() + + +def test_linear8bit_deepcopy_param(linear8bit): + deep_copy = copy.deepcopy(linear8bit) + assert linear8bit.weight is not deep_copy.weight + assert linear8bit.bias is not deep_copy.bias + assert linear8bit.weight.data.data_ptr() != deep_copy.weight.data.data_ptr() + assert torch.allclose(linear8bit.weight.data, deep_copy.weight.data) + assert linear8bit.state == deep_copy.state + + # check for a bug where SCB and CB were not copied + assert deep_copy.weight.SCB is not None + assert (linear8bit.weight.SCB == deep_copy.weight.SCB).all() + assert deep_copy.weight.CB is not None + assert (linear8bit.weight.CB == deep_copy.weight.CB).all() + + +def test_linear8bit_serialization(linear8bit): + serialized = pickle.dumps(linear8bit) + deserialized = pickle.loads(serialized) + assert linear8bit.weight.data.data_ptr() != deserialized.weight.data.data_ptr() + assert torch.allclose(linear8bit.weight.data, deserialized.weight.data) + assert linear8bit.bias.data.data_ptr() != deserialized.bias.data.data_ptr() + assert torch.allclose(linear8bit.bias.data, deserialized.bias.data) + assert linear8bit.state == deserialized.state + + # check for a bug where SCB and CB were not copied + assert (linear8bit.weight.SCB == deserialized.weight.SCB).all() + assert (linear8bit.weight.CB == deserialized.weight.CB).all() diff --git a/tests/test_modules.py b/tests/test_modules.py index 8235b600c..1947ba52d 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -620,7 +620,7 @@ def test_fp8linear(): assert bgraderr < 0.00002 -def test_4bit_warnings(): +def test_4bit_warnings(requires_cuda): dim1 = 64 with pytest.warns(UserWarning, match=r"inference or training"):
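The new `Linear8bitLt` tests above spell out the user-facing contract of the deepcopy and `__getstate__` fixes: a quantized layer can be copied or pickled without losing its `CB`/`SCB` statistics. A minimal usage sketch of that round trip, assuming a CUDA device is available and mirroring the settings of the test fixture rather than introducing any new API:

```python
import copy
import pickle

import torch
import bitsandbytes as bnb

# Quantization happens when the layer is moved to the GPU.
layer = bnb.nn.Linear8bitLt(32, 96, has_fp16_weights=False, threshold=6.0).cuda()

clone = copy.deepcopy(layer)
restored = pickle.loads(pickle.dumps(layer))

# With the fixes in this release, the int8 weights and their scales survive both paths.
assert clone.weight.SCB is not None and restored.weight.SCB is not None
assert torch.equal(layer.weight.CB, restored.weight.CB)
```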