Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions cuda_bindings/tests/nvml/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,9 @@ def get_devices(device_info):


@pytest.fixture
def for_all_devices(device_info):
def all_devices(device_info):
with NVMLInitializer():
unique_devices = set()
for device_id in get_devices(device_info):
if device_id not in unique_devices:
unique_devices.add(device_id)
yield device_id
# RestoreDefaultEnvironment.restore()
yield sorted(list(set(get_devices(device_info))))


@pytest.fixture
Expand Down
28 changes: 16 additions & 12 deletions cuda_bindings/tests/nvml/test_compute_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,19 @@


@pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
def test_compute_mode_supported_nonroot(for_all_devices):
device = for_all_devices

try:
original_compute_mode = nvml.device_get_compute_mode(device)
except nvml.NotSupportedError:
pytest.skip("nvmlDeviceGetComputeMode not supported")

for cm in COMPUTE_MODES:
with pytest.raises(nvml.NoPermissionError):
nvml.device_set_compute_mode(device, cm)
assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
def test_compute_mode_supported_nonroot(all_devices):
skip_reasons = set()
for device in all_devices:
try:
original_compute_mode = nvml.device_get_compute_mode(device)
except nvml.NotSupportedError:
skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
continue

for cm in COMPUTE_MODES:
with pytest.raises(nvml.NoPermissionError):
nvml.device_set_compute_mode(device, cm)
assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"

if skip_reasons:
pytest.skip(" ; ".join(skip_reasons))
29 changes: 17 additions & 12 deletions cuda_bindings/tests/nvml/test_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,24 @@ def test_gpu_get_module_id(nvml_init):
assert isinstance(module_id, int)


def test_gpu_get_platform_info(for_all_devices):
device = for_all_devices
def test_gpu_get_platform_info(all_devices):
skip_reasons = set()
for device in all_devices:
if util.is_vgpu(device):
skip_reasons.add(f"Not supported on vGPU device {device}")
continue

if util.is_vgpu(device):
pytest.skip("Not supported on vGPU device")
# TODO
# if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
# test_utils.skip_test("Not supported on chip before Blackwell")

# TODO
# if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
# test_utils.skip_test("Not supported on chip before Blackwell")
try:
platform_info = nvml.device_get_platform_info(device)
except nvml.NotSupportedError:
skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
continue

try:
platform_info = nvml.device_get_platform_info(device)
except nvml.NotSupportedError:
pytest.skip("Not supported returned, likely NVLink is disabled.")
assert isinstance(platform_info, nvml.PlatformInfo_v2)

assert isinstance(platform_info, nvml.PlatformInfo_v2)
if skip_reasons:
pytest.skip(" ; ".join(skip_reasons))
35 changes: 17 additions & 18 deletions cuda_bindings/tests/nvml/test_nvlink.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,25 @@
from cuda.bindings import _nvml as nvml


def test_nvlink_get_link_count(for_all_devices):
def test_nvlink_get_link_count(all_devices):
"""
Checks that the link count of the device is same.
"""
device = for_all_devices
for device in all_devices:
fields = nvml.FieldValue(1)
fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
value = nvml.device_get_field_values(device, fields)[0]
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
f"Unexpected return {value.nvml_return} for link count field query"
)

fields = nvml.FieldValue(1)
fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
value = nvml.device_get_field_values(device, fields)[0]
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
f"Unexpected return {value.nvml_return} for link count field query"
)
# Use the alternative argument to device_get_field_values
value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
f"Unexpected return {value.nvml_return} for link count field query"
)

# Use the alternative argument to device_get_field_values
value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
f"Unexpected return {value.nvml_return} for link count field query"
)

# The feature_nvlink_supported detection is not robust, so we
# can't be more specific about how many links we should find.
if value.nvml_return == nvml.Return.SUCCESS:
assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
# The feature_nvlink_supported detection is not robust, so we
# can't be more specific about how many links we should find.
if value.nvml_return == nvml.Return.SUCCESS:
assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
59 changes: 36 additions & 23 deletions cuda_bindings/tests/nvml/test_page_retirement.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,42 +20,55 @@ def supports_page_retirement(device):
return False


def test_page_retirement_notsupported(for_all_devices):
def test_page_retirement_notsupported(all_devices):
"""
Verifies that on platforms that don't support page retirement, APIs will return Not Supported
"""
device = for_all_devices
skip_reasons = set()

if supports_page_retirement(device):
pytest.skip("page_retirement not supported")
for device in all_devices:
if supports_page_retirement(device):
skip_reasons.add(f"page_retirement is supported for {device}")
continue

if not util.supports_ecc(device):
pytest.skip("device doesn't support ECC")
if not util.supports_ecc(device):
skip_reasons.add(f"device doesn't support ECC for {device}")
continue

with pytest.raises(nvml.NotSupportedError):
for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
nvml.device_get_retired_pages(device, source)
with pytest.raises(nvml.NotSupportedError):
for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
nvml.device_get_retired_pages(device, source)

with pytest.raises(nvml.NotSupportedError):
nvml.device_get_retired_pages_pending_status(device)
with pytest.raises(nvml.NotSupportedError):
nvml.device_get_retired_pages_pending_status(device)

if skip_reasons:
pytest.skip(" ; ".join(skip_reasons))

def test_page_retirement_supported(for_all_devices):

def test_page_retirement_supported(all_devices):
"""
Verifies that on platforms that support page_retirement, APIs will return success
"""
device = for_all_devices
skip_reasons = set()

if not supports_page_retirement(device):
pytest.skip("page_retirement not supported")
for device in all_devices:
if not supports_page_retirement(device):
skip_reasons.add(f"page_retirement not supported for {device}")
continue

if not util.supports_ecc(device):
pytest.skip("device doesn't support ECC")
if not util.supports_ecc(device):
skip_reasons.add(f"device doesn't support ECC for {device}")
continue

try:
for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
nvml.device_get_retired_pages(device, source)
except nvml.NotSupportedError:
pytest.skip("Exception case: Page retirment is not supported in this GPU")
try:
for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES:
nvml.device_get_retired_pages(device, source)
except nvml.NotSupportedError:
skip_reasons.add(f"Exception case: Page retirement is not supported in this GPU {device}")
continue

nvml.device_get_retired_pages_pending_status(device)

nvml.device_get_retired_pages_pending_status(device)
if skip_reasons:
pytest.skip(" ; ".join(skip_reasons))
Loading