Merged

55 commits
5891465
Add build job for rocm
pnunna93 Jun 19, 2024
d03a680
Add rocm build script
pnunna93 Jun 19, 2024
ec9000f
Copy shared obj file into output_dir
pnunna93 Jun 20, 2024
9b8c1da
upload build artifacts and enable wheels build
pnunna93 Jun 20, 2024
1413c5f
Remove cuda build temporarily
pnunna93 Jun 20, 2024
578b2f4
Merge pull request #38 from ROCm/enable_rocm_build_ci
pnunna93 Jun 21, 2024
fd655b0
Add ROCm version to .so filename
pnunna93 Jul 29, 2024
6b77f4c
Add rocm_version to whls build
pnunna93 Jul 29, 2024
78324b3
Revert "Remove cuda build temporarily"
pnunna93 Jul 29, 2024
c146b8b
Add rocm_version env var
pnunna93 Jul 29, 2024
953a383
Merge remote-tracking branch 'upstream/multi-backend-refactor' into e…
pnunna93 Jul 29, 2024
d6c3df4
Remove thrust header files
pnunna93 Jul 30, 2024
7e9a65c
Print node info
pnunna93 Jul 30, 2024
cdb209a
print cuda node info
pnunna93 Jul 30, 2024
77e1499
Revert "print cuda node info"
pnunna93 Jul 30, 2024
7c91909
Revert "Print node info"
pnunna93 Jul 30, 2024
b78b340
Add rocm arch to compile command
pnunna93 Jul 30, 2024
a62b9d4
Rename .so files to rocm
pnunna93 Jul 30, 2024
9059bff
Update default gpu arch
pnunna93 Jul 30, 2024
c5a406a
Skip cpu based igemmlt int tests on ROCm
pnunna93 Jul 30, 2024
9cbb5e1
Update Documentation
pnunna93 Jul 30, 2024
3580624
Update upstream repo name
pnunna93 Jul 30, 2024
3bde1b7
Update docs
pnunna93 Jul 30, 2024
b123125
Merge pull request #39 from ROCm/enable_rocm_whls
pnunna93 Jul 31, 2024
db1df72
Update string format
pnunna93 Jul 31, 2024
e498b4d
Remove pre-release option for torch install
pnunna93 Jul 31, 2024
7d2e027
Update pytorch install path
pnunna93 Aug 1, 2024
0c76b1c
Add messages for Heuristics error
pnunna93 Aug 16, 2024
714d9e9
Remove toolcache for disk space
pnunna93 Aug 16, 2024
ce77361
print disk usage
pnunna93 Aug 16, 2024
b87c2b9
Clean disk space for linux
pnunna93 Aug 16, 2024
828fdc6
Fix for ubuntu
pnunna93 Aug 16, 2024
5721601
Add sudo for apt clean
pnunna93 Aug 16, 2024
d58303f
Update clean up disk list
pnunna93 Aug 16, 2024
483e8ca
remove disk usage print
pnunna93 Aug 16, 2024
9d111df
Merge pull request #42 from ROCm/add_err_msg
pnunna93 Aug 24, 2024
52ba52e
Add BNB_BACKEND variable
pnunna93 Aug 24, 2024
755dfbe
Update diagnostic functions for ROCm
pnunna93 Aug 24, 2024
70c3d6b
Fix tuple error
pnunna93 Aug 25, 2024
7b038e9
Fix library detection bug for recursive and symlink cases
pnunna93 Aug 25, 2024
343c9fa
fix pre-commit errors
pnunna93 Aug 25, 2024
42cc717
Merge pull request #43 from ROCm/update_diagnostics
pnunna93 Aug 25, 2024
b22eb2e
Merge branch 'multi-backend-refactor' into device_abstraction
pnunna93 Aug 25, 2024
f2ea137
Remove recursive path lib search
pnunna93 Sep 11, 2024
ee6abed
Create function for runtime lib patterns
pnunna93 Sep 11, 2024
6f9cd26
Update logger format
pnunna93 Sep 12, 2024
570137c
Update error reporting
pnunna93 Sep 12, 2024
3380df4
Remove commented code
pnunna93 Sep 12, 2024
1c5bd4f
Update error reporting
pnunna93 Sep 12, 2024
4655a41
Merge branch 'device_abstraction' into fix_diagnostic_feedback
pnunna93 Sep 12, 2024
f39ff48
Update error reporting
pnunna93 Sep 12, 2024
f57addd
Create hip diagnostics functions
pnunna93 Sep 12, 2024
251a0e8
Fix Typo
pnunna93 Sep 12, 2024
260a3ac
Fix pre-commit checks
pnunna93 Sep 12, 2024
48bfb20
Merge pull request #45 from ROCm/fix_diagnostic_feedback
pnunna93 Sep 13, 2024
21 changes: 17 additions & 4 deletions .github/workflows/python-package.yml
@@ -116,10 +116,23 @@ jobs:
uses: docker/setup-qemu-action@v2
- name: Clean up disk space
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
sudo rm -rf \
/usr/share/dotnet \
/opt/ghc \
"/usr/local/share/boost" \
"$AGENT_TOOLSDIRECTORY" \
/opt/hostedtoolcache \
/opt/google/chrome \
/opt/microsoft/msedge \
/opt/microsoft/powershell \
/opt/pipx \
/usr/lib/mono \
/usr/local/julia* \
/usr/local/lib/android \
/usr/local/lib/node_modules \
/usr/local/share/chromium \
/usr/local/share/powershell \
/usr/share/swift
Comment on lines +119 to +135

Contributor:
Why is this cleaning needed in the first place? I've never seen anything like it in any GitHub Actions workflow I've come across 🤔

@pnunna93 (Contributor Author), Sep 13, 2024:
GitHub runner runs into disk space issues during docker pull. Those applications are not used, so I deleted them to clear some space.
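For context, the commit list above includes "print disk usage" and "remove disk usage print"; a minimal sketch of that kind of check (an assumption on my part, not necessarily the exact step that was used), run before and after the cleanup on a Linux runner:

import shutil

# Free space on the runner's root filesystem; comparing the value
# before and after the cleanup step shows how much room was reclaimed.
total, used, free = shutil.disk_usage("/")
print(f"free: {free / 2**30:.1f} GiB of {total / 2**30:.1f} GiB")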

- name: Build C++
run: bash .github/scripts/build-rocm.sh
env:
11 changes: 7 additions & 4 deletions bitsandbytes/cextension.py
@@ -99,7 +99,7 @@ def get_native_library() -> BNBNativeLibrary:
if cuda_binary_path.exists():
binary_path = cuda_binary_path
else:
logger.warning("Could not find the bitsandbytes CUDA binary at %r", cuda_binary_path)
logger.warning("Could not find the bitsandbytes %s binary at %r", BNB_BACKEND, cuda_binary_path)
logger.debug(f"Loading bitsandbytes native library from: {binary_path}")
dll = ct.cdll.LoadLibrary(str(binary_path))

@@ -120,21 +120,24 @@ def get_native_library() -> BNBNativeLibrary:
hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor
BNB_HIP_VERSION_SHORT = f"{hip_major}{hip_minor}"
BNB_BACKEND = "ROCm"
else:
HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
BNB_HIP_VERSION_SHORT = ""
BNB_BACKEND = "CUDA"

lib = get_native_library()
except Exception as e:
lib = None
logger.error(f"Could not load bitsandbytes native library: {e}", exc_info=True)
if torch.cuda.is_available():
logger.warning(
"""
CUDA Setup failed despite CUDA being available. Please run the following command to get more information:
f"""
{BNB_BACKEND} Setup failed despite {BNB_BACKEND} being available. Please run the following command to get more information:

python -m bitsandbytes

Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
Inspect the output of the command and see if you can locate {BNB_BACKEND} libraries. You might need to add them
to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues
""",
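Taken together, the cextension.py changes above reduce to the following self-contained sketch of the backend detection (assuming a PyTorch build where torch.version.hip is None on CUDA and a version string such as "6.1.40091" on ROCm):

import torch

# torch.version.hip is None on CUDA builds of PyTorch and a version
# string such as "6.1.40091" on ROCm builds.
if torch.version.hip:
    hip_major, hip_minor = map(int, torch.version.hip.split(".")[0:2])
    HIP_ENVIRONMENT, BNB_HIP_VERSION = True, hip_major * 100 + hip_minor  # e.g. 6.1 -> 601
    BNB_BACKEND = "ROCm"
else:
    HIP_ENVIRONMENT, BNB_HIP_VERSION = False, 0
    BNB_BACKEND = "CUDA"

print(f"backend={BNB_BACKEND}, hip_version={BNB_HIP_VERSION}")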
89 changes: 77 additions & 12 deletions bitsandbytes/diagnostics/cuda.py
@@ -5,7 +5,7 @@

import torch

from bitsandbytes.cextension import get_cuda_bnb_library_path
from bitsandbytes.cextension import HIP_ENVIRONMENT, get_cuda_bnb_library_path
from bitsandbytes.consts import NONPYTORCH_DOC_URL
from bitsandbytes.cuda_specs import CUDASpecs
from bitsandbytes.diagnostics.utils import print_dedented
@@ -32,15 +32,20 @@
"_", # current Python interpreter
}

CUDA_RUNTIME_LIB_PATTERNS = (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)

logger = logging.getLogger(__name__)


def get_runtime_lib_patterns() -> tuple:
if HIP_ENVIRONMENT:
return ("libamdhip64.so*",)
else:
return (
"cudart64*.dll", # Windows
"libcudart*.so*", # libcudart.so, libcudart.so.11.0, libcudart.so.12.0, libcudart.so.12.1, libcudart.so.12.2 etc.
"nvcuda*.dll", # Windows
)


def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path]:
for dir_string in paths_list_candidate.split(os.pathsep):
if not dir_string:
@@ -55,9 +60,9 @@ def find_cuda_libraries_in_path_list(paths_list_candidate: str) -> Iterable[Path
continue
except OSError: # Assume an esoteric error trying to poke at the directory
pass
for lib_pattern in CUDA_RUNTIME_LIB_PATTERNS:
for lib_pattern in get_runtime_lib_patterns():
for pth in dir.glob(lib_pattern):
if pth.is_file():
if pth.is_file() and not pth.is_symlink():
yield pth
except (OSError, PermissionError):
pass
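Condensed into one function, the pattern search with the new symlink exclusion behaves like the sketch below; find_runtime_libs is a hypothetical stand-in for get_runtime_lib_patterns() plus find_cuda_libraries_in_path_list(), not the library's actual API:

import os
from pathlib import Path

def find_runtime_libs(path_list, patterns):
    """Yield regular files matching any glob pattern in a PATH-like string.

    Symlinks are skipped, mirroring the fix above, so the same runtime
    library is not reported twice (real path plus symlink alias).
    """
    for dir_string in path_list.split(os.pathsep):
        directory = Path(dir_string)
        if not dir_string or not directory.is_dir():
            continue
        for pattern in patterns:
            for pth in directory.glob(pattern):
                if pth.is_file() and not pth.is_symlink():
                    yield pth

# Example with the ROCm pattern returned by get_runtime_lib_patterns():
for lib in find_runtime_libs(os.environ.get("LD_LIBRARY_PATH", ""), ("libamdhip64.so*",)):
    print(lib)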
@@ -104,7 +109,7 @@ def find_cudart_libraries() -> Iterator[Path]:
yield from find_cuda_libraries_in_path_list(value)


def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
def _print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
print(
f"PyTorch settings found: CUDA_VERSION={cuda_specs.cuda_version_string}, "
f"Highest Compute Capability: {cuda_specs.highest_compute_capability}.",
@@ -149,10 +154,40 @@ def print_cuda_diagnostics(cuda_specs: CUDASpecs) -> None:
# (2) Multiple CUDA versions installed


def print_cuda_runtime_diagnostics() -> None:
def _print_hip_diagnostics(cuda_specs: CUDASpecs) -> None:
print(f"PyTorch settings found: ROCM_VERSION={cuda_specs.cuda_version_string}")

binary_path = get_cuda_bnb_library_path(cuda_specs)
if not binary_path.exists():
print_dedented(
f"""
Library not found: {binary_path}.
Maybe you need to compile it from source? If you compiled from source, check that ROCM_VERSION
in PyTorch Settings matches your ROCm install. If not, reinstall PyTorch for your ROCm version
and rebuild bitsandbytes.
""",
)

hip_major, hip_minor = cuda_specs.cuda_version_tuple
if (hip_major, hip_minor) < (6, 1):
print_dedented(
"""
WARNING: bitsandbytes is fully supported only from ROCm 6.1.
""",
)


def print_diagnostics(cuda_specs: CUDASpecs) -> None:
if HIP_ENVIRONMENT:
_print_hip_diagnostics(cuda_specs)
else:
_print_cuda_diagnostics(cuda_specs)


def _print_cuda_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print("CUDA SETUP: WARNING! CUDA runtime files not found in any environmental path.")
print("WARNING! CUDA runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
print_dedented(
f"""
Expand All @@ -174,3 +209,33 @@ def print_cuda_runtime_diagnostics() -> None:
)
for pth in cudart_paths:
print(f"* Found CUDA runtime at: {pth}")


def _print_hip_runtime_diagnostics() -> None:
cudart_paths = list(find_cudart_libraries())
if not cudart_paths:
print("WARNING! ROCm runtime files not found in any environmental path.")
elif len(cudart_paths) > 1:
print_dedented(
f"""
Found duplicate ROCm runtime files (see below).

We select the PyTorch default ROCm runtime, which is {torch.version.hip},
but this might mismatch with the ROCm version that is needed for bitsandbytes.

To resolve it, install PyTorch built for the ROCm version you want to use

and set LD_LIBRARY_PATH to your ROCm install path, e.g.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm-6.1.2/lib,
""",
)

for pth in cudart_paths:
print(f"* Found ROCm runtime at: {pth}")


def print_runtime_diagnostics() -> None:
if HIP_ENVIRONMENT:
_print_hip_runtime_diagnostics()
else:
_print_cuda_runtime_diagnostics()
31 changes: 19 additions & 12 deletions bitsandbytes/diagnostics/main.py
@@ -3,11 +3,12 @@

import torch

from bitsandbytes.cextension import BNB_BACKEND, HIP_ENVIRONMENT
from bitsandbytes.consts import PACKAGE_GITHUB_URL
from bitsandbytes.cuda_specs import get_cuda_specs
from bitsandbytes.diagnostics.cuda import (
print_cuda_diagnostics,
print_cuda_runtime_diagnostics,
print_diagnostics,
print_runtime_diagnostics,
)
from bitsandbytes.diagnostics.utils import print_dedented, print_header

@@ -16,12 +17,13 @@ def sanity_check():
from bitsandbytes.cextension import lib

if lib is None:
compute_backend = "cuda" if not HIP_ENVIRONMENT else "hip"
print_dedented(
"""
f"""
Couldn't load the bitsandbytes library, likely due to missing binaries.
Please ensure bitsandbytes is properly installed.

For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND=cuda -S .`.
For source installations, compile the binaries with `cmake -DCOMPUTE_BACKEND={compute_backend} -S .`.
See the documentation for more details if needed.

Trying a simple check anyway, but this will likely fail...
@@ -49,19 +51,24 @@ def main():

print_header("OTHER")
cuda_specs = get_cuda_specs()
print("CUDA specs:", cuda_specs)
if HIP_ENVIRONMENT:
rocm_specs = f" rocm_version_string='{cuda_specs.cuda_version_string}',"
rocm_specs += f" rocm_version_tuple={cuda_specs.cuda_version_tuple}"
print(f"{BNB_BACKEND} specs:{rocm_specs}")
else:
print(f"{BNB_BACKEND} specs:{cuda_specs}")
Comment on lines +54 to +59

Contributor:
Yeah, this smells even more like CudaSpecs and RocmSpecs should be separate dataclasses.

@pnunna93 (Contributor Author):
This requires updates to cextensions and additional testing. Could we use this workaround for now and make the change after alpha release?

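For illustration, a rough sketch of the separate-dataclass idea under discussion; RocmSpecs and its field names are hypothetical, while the CudaSpecs fields mirror the CUDASpecs attributes used elsewhere in this diff:

from dataclasses import dataclass

@dataclass(frozen=True)
class CudaSpecs:
    cuda_version_string: str           # e.g. "12.1"
    cuda_version_tuple: tuple          # e.g. (12, 1)
    highest_compute_capability: tuple  # e.g. (8, 6)

@dataclass(frozen=True)
class RocmSpecs:
    rocm_version_string: str  # e.g. "6.1"
    rocm_version_tuple: tuple # e.g. (6, 1)
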
if not torch.cuda.is_available():
print("Torch says CUDA is not available. Possible reasons:")
print("1. CUDA driver not installed")
print("2. CUDA not installed")
print("3. You have multiple conflicting CUDA libraries")
print(f"Torch says {BNB_BACKEND} is not available. Possible reasons:")
print(f"1. {BNB_BACKEND} driver not installed")
print(f"2. {BNB_BACKEND} not installed")
print(f"3. You have multiple conflicting {BNB_BACKEND} libraries")
if cuda_specs:
print_cuda_diagnostics(cuda_specs)
print_cuda_runtime_diagnostics()
print_diagnostics(cuda_specs)
print_runtime_diagnostics()
print_header("")
print_header("DEBUG INFO END")
print_header("")
print("Checking that the library is importable and CUDA is callable...")
print(f"Checking that the library is importable and {BNB_BACKEND} is callable...")
try:
sanity_check()
print("SUCCESS!")
26 changes: 17 additions & 9 deletions csrc/ops.hip
@@ -576,6 +576,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
if (returnedAlgoCount == 0)
{
has_error = 1;
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
}
else
{
@@ -614,18 +615,25 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
heuristicResult,
&returnedAlgoCount));

if(!SCALE_ROWS)
if (returnedAlgoCount == 0)
{
float alpha = 1.0f, beta = 0.0f;

has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
has_error = 1;
fprintf(stderr, "Error: Matmul Algo Heuristic didn't return algorithms\n");
}
else
{
//has_error |= checkHipblasStatus(hipblasLtMatmulDescSetAttribute(matmulDesc, hipblasLt_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
float beta = 0.0f;

has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
if(!SCALE_ROWS)
{
float alpha = 1.0f, beta = 0.0f;

has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc,&alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
}
else
{
float beta = 0.0f;

has_error |= checkHipblasStatus(hipblasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, &heuristicResult[0].algo, nullptr, 0, 0));
}
}
}

@@ -635,7 +643,7 @@ template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(hipblasLtHandl
if (Adesc) has_error |= checkHipblasStatus(hipblasLtMatrixLayoutDestroy(Adesc));
if (matmulDesc) has_error |= checkHipblasStatus(hipblasLtMatmulDescDestroy(matmulDesc));
if(has_error == 1)
printf("error detected");
fprintf(stderr, "error detected\n");

return has_error;
#endif // NO_HIPBLASLT
Expand Down