diff --git a/bitsandbytes/cextension.py b/bitsandbytes/cextension.py
index 3d08c9084..e322693b5 100644
--- a/bitsandbytes/cextension.py
+++ b/bitsandbytes/cextension.py
@@ -120,7 +120,7 @@ def get_native_library() -> BNBNativeLibrary:
     hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
     HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
     BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
-    BNB_BACKEND = "ROCM"
+    BNB_BACKEND = "ROCm"
 else:
     HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
     BNB_HIP_VERSION_SHORT = ""
diff --git a/bitsandbytes/diagnostics/cuda.py b/bitsandbytes/diagnostics/cuda.py
index 6c66c6219..014b753a9 100644
--- a/bitsandbytes/diagnostics/cuda.py
+++ b/bitsandbytes/diagnostics/cuda.py
@@ -5,7 +5,7 @@
 
 import torch
 
-from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT, get_cuda_bnb_library_path
+from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
 from bitsandbytes.consts import NONPYTORCH_DOC_URL
 from bitsandbytes.cuda_specs import CUDASpecs
 from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,16 +32,18 @@
     "_",  # current Python interpreter
 }
 
-CUDA_RUNTIME_LIB_PATTERNS = (
-    "cudart64*.dll",  # Windows
-    "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
-    "nvcuda*.dll",  # Windows
-)
+logger = logging.getLogger(__name__)
 
-if HIP_ENVIRONMENT:
-    CUDA_RUNTIME_LIB_PATTERNS = ("libamdhip64.so*",)
 
-logger = logging.getLogger(__name__)
+def get_runtime_lib_patterns() -> tuple:
+    if HIP_ENVIRONMENT:
+        return ("libamdhip64.so*",)
+    else:
+        return (
+            "cudart64*.dll",  # Windows
+            "libcudart*.so*",  # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
+            "nvcuda*.dll",  # Windows
+        )
 
 
 def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
@@ -58,8 +60,8 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]
                 continue
         except OSError:  # Assume an esoteric error trying to poke at the directory
             pass
-        for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
-            for pth in dir.rglob(lib_pattern):
+        for lib_pattern in get_runtime_lib_patterns():
+            for pth in dir.glob(lib_pattern):
                 if pth.is_file() and not pth.is_symlink():
                     yield pth
     except (OSError, PermissionError):
@@ -107,59 +109,38 @@ def find_cudart_libraries() -> Iterator[Path]:
         yield from find_cuda_libraries_in_path_list(value)
 
 
-def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
-    if not HIP_ENVIRONMENT:
-        print(
-            f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
-            f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
-        )
-    else:
-        print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")
+def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
+    print(
+        f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
+        f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
+    )
 
     binary_path = get_cuda_bnb_library_path(cuda_specs)
     if not binary_path.exists():
-        if not HIP_ENVIRONMENT:
-            print_dedented(
-                f"""
-                Library not found: {binary_path}. Maybe you need to compile it from source?
-                If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`,
-                for example, `make CUDA_VERSION=113`.
-
-                The CUDA version for the compile might depend on your conda install, if using conda.
-                Inspect CUDA version via `conda list | grep cuda`.
- """, - ) - else: - print_dedented( - f""" - Library not found: {binary_path}. - Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION - in PyTorch Settings matches your ROCM install. If not, reinstall PyTorch for your ROCm version - and rebuild bitsandbytes. - """, - ) + print_dedented( + f""" + Library not found: {binary_path}. Maybe you need to compile it from source? + If you compiled from source, try again with `make CUDA_VERSION=DETECTED_CUDA_VERSION`, + for example, `make CUDA_VERSION=113`. + + The CUDA version for the compile might depend on your conda install, if using conda. + Inspect CUDA version via `conda list | grep cuda`. + """, + ) cuda_major, cuda_minor = cuda_specs.cuda_version_tuple - if not HIP_ENVIRONMENT: - if cuda_major < 11: - print_dedented( - """ - WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8(). - You will be only to use 8-bit optimizers and quantization routines! - """, - ) - - print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}") - else: - if (cuda_major, cuda_minor) < (6, 1): - print_dedented( - """ - WARNING: bitandbytes is fully supported only from ROCm 6.1. - """, - ) + if cuda_major < 11: + print_dedented( + """ + WARNING: CUDA versions lower than 11 are currently not supported for LLM.int8(). + You will be only to use 8-bit optimizers and quantization routines! + """, + ) + + print(f"To manually override the PyTorch CUDA version please see: {NONPYTORCH_DOC_URL}") # 7.5 is the minimum CC for cublaslt - if not cuda_specs.has_cublaslt and not HIP_ENVIRONMENT: + if not cuda_specs.has_cublaslt: print_dedented( """ WARNING: Compute capability < 7.5 detected! Only slow 8-bit matmul is supported for your GPU! @@ -173,44 +154,88 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None: # (2) Multiple CUDA versions installed -def print_cuda_runtime_diagnostics() -> None: +def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None: + print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}") + + binary_path = get_cuda_bnb_library_path(cuda_specs) + if not binary_path.exists(): + print_dedented( + f""" + Library not found: {binary_path}. + Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION + in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version + and rebuild bitsandbytes. + """, + ) + + hip_major, hip_minor = cuda_specs.cuda_version_tuple + if (hip_major, hip_minor) < (6, 1): + print_dedented( + """ + WARNING: bitsandbytes is fully supported only from ROCm 6.1. + """, + ) + + +def print_diagnostics(cuda_specs: CUDASpecs) -> None: + if HIP_ENVIRONMENT: + _print_hip_diagnostics(cuda_specs) + else: + _print_cuda_diagnostics(cuda_specs) + + +def _print_cuda_runtime_diagnostics() -> None: cudart_paths = list(find_cudart_libraries()) if not cudart_paths: - print(f"{BNB_BACKEND} SETUP: WARNING! {BNB_BACKEND} runtime files not found in any environmental path.") + print("WARNING! CUDA runtime files not found in any environmental path.") elif len(cudart_paths) > 1: - backend_version = torch.version.cuda if not HIP_ENVIRONMENT else torch.version.hip print_dedented( f""" - Found duplicate {BNB_BACKEND} runtime files (see below). + Found duplicate CUDA runtime files (see below). + + We select the PyTorch default CUDA runtime, which is {torch.version.cuda}, + but this might mismatch with the CUDA version that is needed for bitsandbytes. 
+            To override this behavior set the `BNB_CUDA_VERSION=` environmental variable.
+
+            For example, if you want to use the CUDA version 122,
+                BNB_CUDA_VERSION=122 python ...
+
+            OR set the environmental variable in your .bashrc:
+                export BNB_CUDA_VERSION=122
 
-            We select the PyTorch default {BNB_BACKEND} runtime, which is {backend_version},
-            but this might mismatch with the {BNB_BACKEND} version that is needed for bitsandbytes.
+            In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
             """,
         )
+        for pth in cudart_paths:
+            print(f"* Found CUDA runtime at: {pth}")
+
+
+def _print_hip_runtime_diagnostics() -> None:
+    cudart_paths = list(find_cudart_libraries())
+    if not cudart_paths:
+        print("WARNING! ROCm runtime files not found in any environmental path.")
+    elif len(cudart_paths) > 1:
+        print_dedented(
+            f"""
+            Found duplicate ROCm runtime files (see below).
+
+            We select the PyTorch default ROCm runtime, which is {torch.version.hip},
+            but this might mismatch with the ROCm version that is needed for bitsandbytes.
+
+            To resolve it, install PyTorch built for the ROCm version you want to use
+
+            and set LD_LIBRARY_PATH to your ROCm install path, e.g.
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
+            """,
+        )
-        if not HIP_ENVIRONMENT:
-            print_dedented(
-                """
-                To override this behavior set the `BNB_CUDA_VERSION=` environmental variable.
-
-                For example, if you want to use the CUDA version 122,
-                    BNB_CUDA_VERSION=122 python ...
-
-                OR set the environmental variable in your .bashrc:
-                    export BNB_CUDA_VERSION=122
-
-                In the case of a manual override, make sure you set LD_LIBRARY_PATH, e.g.
-                export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2,
-                """,
-            )
-        else:
-            print_dedented(
-                """
-                To resolve it, install PyTorch built for the ROCm version you want to use
-
-                and set LD_LIBRARY_PATH to your ROCm install path, e.g.
-                export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/rocm-6.1.2,
-                """,
-            )
 
     for pth in cudart_paths:
-        print(f"* Found {BNB_BACKEND} runtime at: {pth}")
+        print(f"* Found ROCm runtime at: {pth}")
+
+
+def print_runtime_diagnostics() -> None:
+    if HIP_ENVIRONMENT:
+        _print_hip_runtime_diagnostics()
+    else:
+        _print_cuda_runtime_diagnostics()
diff --git a/bitsandbytes/diagnostics/main.py b/bitsandbytes/diagnostics/main.py
index 9165cbeed..8dc43ed2a 100644
--- a/bitsandbytes/diagnostics/main.py
+++ b/bitsandbytes/diagnostics/main.py
@@ -7,8 +7,8 @@
 from bitsandbytes.consts import PACKAGE_GITHUB_URL
 from bitsandbytes.cuda_specs import get_cuda_specs
 from bitsandbytes.diagnostics.cuda import (
-    print_cuda_diagnostics,
-    print_cuda_runtime_diagnostics,
+    print_diagnostics,
+    print_runtime_diagnostics,
 )
 from bitsandbytes.diagnostics.utils import print_dedented, print_header
 
@@ -63,8 +63,8 @@ def main():
     print(f"2. {BNB_BACKEND} not installed")
     print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
     if cuda_specs:
-        print_cuda_diagnostics(cuda_specs)
-        print_cuda_runtime_diagnostics()
+        print_diagnostics(cuda_specs)
+        print_runtime_diagnostics()
     print_header("")
     print_header("DEBUG INFO END")
     print_header("")
diff --git a/csrc/ops.hip b/csrc/ops.hip
index a808d5ecb..4fdc3cbfa 100644
--- a/csrc/ops.hip
+++ b/csrc/ops.hip
@@ -618,7 +618,7 @@ template int igemmlt(hipblasLtHandl
     if (returnedAlgoCount == 0)
     {
       has_error = 1;
-      printf("Error: Matmul Algo Heurisitic didn't return algorithms\n");
+      fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
     }
     else
     {
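
Note: the patch replaces the per-function HIP_ENVIRONMENT branching with backend-specific helpers behind two small dispatchers. A minimal sketch (not part of the patch) of how the new entry points are driven, mirroring the call site in bitsandbytes/diagnostics/main.py above; it assumes an environment where bitsandbytes imports cleanly, and relies on the existing behavior that get_cuda_specs() returns None when torch sees no device, which the call site already guards for:

    # Sketch only: exercises the dispatchers introduced by this patch.
    from bitsandbytes.cuda_specs import get_cuda_specs
    from bitsandbytes.diagnostics.cuda import print_diagnostics, print_runtime_diagnostics

    cuda_specs = get_cuda_specs()  # None when no CUDA/ROCm device is visible to torch
    if cuda_specs:
        print_diagnostics(cuda_specs)  # picks _print_hip_diagnostics under HIP, else the CUDA variant
        print_runtime_diagnostics()    # likewise selects the ROCm or CUDA duplicate-runtime check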