From 634abf32f33d2a8c8dcf1dd8f6d0f8923bdb6d47 Mon Sep 17 00:00:00 2001
From: Michael Droettboom
Date: Tue, 9 Dec 2025 20:18:22 -0500
Subject: [PATCH] Queue up 'skip reasons'

Rather than skipping a test at the first device that lacks support,
iterate over all devices, queue up a skip reason for each device that
cannot be exercised, and skip at the end only if any reasons were
collected. Replace the per-device 'for_all_devices' generator fixture
with an 'all_devices' fixture that yields a sorted list of unique
devices.
---
 cuda_bindings/tests/nvml/conftest.py          |  9 +--
 cuda_bindings/tests/nvml/test_compute_mode.py | 28 +++++----
 cuda_bindings/tests/nvml/test_gpu.py          | 29 +++++----
 cuda_bindings/tests/nvml/test_nvlink.py       | 35 ++++++-----
 .../tests/nvml/test_page_retirement.py        | 59 +++++++++++--------
 5 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/cuda_bindings/tests/nvml/conftest.py b/cuda_bindings/tests/nvml/conftest.py
index 4d4a8bb5e4..9c674a3ee0 100644
--- a/cuda_bindings/tests/nvml/conftest.py
+++ b/cuda_bindings/tests/nvml/conftest.py
@@ -71,14 +71,9 @@ def get_devices(device_info):
 
 
 @pytest.fixture
-def for_all_devices(device_info):
+def all_devices(device_info):
     with NVMLInitializer():
-        unique_devices = set()
-        for device_id in get_devices(device_info):
-            if device_id not in unique_devices:
-                unique_devices.add(device_id)
-                yield device_id
-    # RestoreDefaultEnvironment.restore()
+        yield sorted(list(set(get_devices(device_info))))
 
 
 @pytest.fixture
diff --git a/cuda_bindings/tests/nvml/test_compute_mode.py b/cuda_bindings/tests/nvml/test_compute_mode.py
index b44c3dc61a..3a8079adbf 100644
--- a/cuda_bindings/tests/nvml/test_compute_mode.py
+++ b/cuda_bindings/tests/nvml/test_compute_mode.py
@@ -15,15 +15,19 @@
 
 
 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
-def test_compute_mode_supported_nonroot(for_all_devices):
-    device = for_all_devices
-
-    try:
-        original_compute_mode = nvml.device_get_compute_mode(device)
-    except nvml.NotSupportedError:
-        pytest.skip("nvmlDeviceGetComputeMode not supported")
-
-    for cm in COMPUTE_MODES:
-        with pytest.raises(nvml.NoPermissionError):
-            nvml.device_set_compute_mode(device, cm)
-    assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
+def test_compute_mode_supported_nonroot(all_devices):
+    skip_reasons = set()
+    for device in all_devices:
+        try:
+            original_compute_mode = nvml.device_get_compute_mode(device)
+        except nvml.NotSupportedError:
+            skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
+            continue
+
+        for cm in COMPUTE_MODES:
+            with pytest.raises(nvml.NoPermissionError):
+                nvml.device_set_compute_mode(device, cm)
+        assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
+
+    if skip_reasons:
+        pytest.skip(" ; ".join(skip_reasons))
diff --git a/cuda_bindings/tests/nvml/test_gpu.py b/cuda_bindings/tests/nvml/test_gpu.py
index 5d23bac350..cd82347088 100644
--- a/cuda_bindings/tests/nvml/test_gpu.py
+++ b/cuda_bindings/tests/nvml/test_gpu.py
@@ -22,19 +22,24 @@ def test_gpu_get_module_id(nvml_init):
     assert isinstance(module_id, int)
 
 
-def test_gpu_get_platform_info(for_all_devices):
-    device = for_all_devices
+def test_gpu_get_platform_info(all_devices):
+    skip_reasons = set()
+    for device in all_devices:
+        if util.is_vgpu(device):
+            skip_reasons.add(f"Not supported on vGPU device {device}")
+            continue
 
-    if util.is_vgpu(device):
-        pytest.skip("Not supported on vGPU device")
+        # TODO
+        # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
+        #     test_utils.skip_test("Not supported on chip before Blackwell")
 
-    # TODO
-    # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
-    #     test_utils.skip_test("Not supported on chip before Blackwell")
+        try:
+            platform_info = nvml.device_get_platform_info(device)
+        except nvml.NotSupportedError:
+            skip_reasons.add(f"Not supported returned, likely NVLink is disabled for {device}")
+            continue
 
-    try:
-        platform_info = nvml.device_get_platform_info(device)
-    except nvml.NotSupportedError:
-        pytest.skip("Not supported returned, likely NVLink is disabled.")
+        assert isinstance(platform_info, nvml.PlatformInfo_v2)
 
-    assert isinstance(platform_info, nvml.PlatformInfo_v2)
+    if skip_reasons:
+        pytest.skip(" ; ".join(skip_reasons))
diff --git a/cuda_bindings/tests/nvml/test_nvlink.py b/cuda_bindings/tests/nvml/test_nvlink.py
index a1fdb63e95..14799898be 100644
--- a/cuda_bindings/tests/nvml/test_nvlink.py
+++ b/cuda_bindings/tests/nvml/test_nvlink.py
@@ -5,26 +5,25 @@
 from cuda.bindings import _nvml as nvml
 
 
-def test_nvlink_get_link_count(for_all_devices):
+def test_nvlink_get_link_count(all_devices):
     """
     Checks that the link count of the device is same.
     """
-    device = for_all_devices
+    for device in all_devices:
+        fields = nvml.FieldValue(1)
+        fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
+        value = nvml.device_get_field_values(device, fields)[0]
+        assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
+            f"Unexpected return {value.nvml_return} for link count field query"
+        )
 
-    fields = nvml.FieldValue(1)
-    fields[0].field_id = nvml.FI.DEV_NVLINK_LINK_COUNT
-    value = nvml.device_get_field_values(device, fields)[0]
-    assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
-        f"Unexpected return {value.nvml_return} for link count field query"
-    )
+        # Use the alternative argument to device_get_field_values
+        value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
+        assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
+            f"Unexpected return {value.nvml_return} for link count field query"
+        )
 
-    # Use the alternative argument to device_get_field_values
-    value = nvml.device_get_field_values(device, [nvml.FI.DEV_NVLINK_LINK_COUNT])[0]
-    assert value.nvml_return == nvml.Return.SUCCESS or value.nvml_return == nvml.Return.ERROR_NOT_SUPPORTED, (
-        f"Unexpected return {value.nvml_return} for link count field query"
-    )
-
-    # The feature_nvlink_supported detection is not robust, so we
-    # can't be more specific about how many links we should find.
-    if value.nvml_return == nvml.Return.SUCCESS:
-        assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}"
+        # The feature_nvlink_supported detection is not robust, so we
+        # can't be more specific about how many links we should find.
+ if value.nvml_return == nvml.Return.SUCCESS: + assert value.value.ui_val <= nvml.NVLINK_MAX_LINKS, f"Unexpected link count {value.value.ui_val}" diff --git a/cuda_bindings/tests/nvml/test_page_retirement.py b/cuda_bindings/tests/nvml/test_page_retirement.py index afa5a56f5d..4aa5260443 100644 --- a/cuda_bindings/tests/nvml/test_page_retirement.py +++ b/cuda_bindings/tests/nvml/test_page_retirement.py @@ -20,42 +20,55 @@ def supports_page_retirement(device): return False -def test_page_retirement_notsupported(for_all_devices): +def test_page_retirement_notsupported(all_devices): """ Verifies that on platforms that don't supports page retirement, APIs will return Not Supported """ - device = for_all_devices + skip_reasons = set() - if supports_page_retirement(device): - pytest.skip("page_retirement not supported") + for device in all_devices: + if supports_page_retirement(device): + skip_reasons.add(f"page_retirement is supported for {device}") + continue - if not util.supports_ecc(device): - pytest.skip("device doesn't support ECC") + if not util.supports_ecc(device): + skip_reasons.add(f"device doesn't support ECC for {device}") + continue - with pytest.raises(nvml.NotSupportedError): - for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES: - nvml.device_get_retired_pages(device, source) + with pytest.raises(nvml.NotSupportedError): + for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES: + nvml.device_get_retired_pages(device, source) - with pytest.raises(nvml.NotSupportedError): - nvml.device_get_retired_pages_pending_status(device) + with pytest.raises(nvml.NotSupportedError): + nvml.device_get_retired_pages_pending_status(device) + if skip_reasons: + pytest.skip(" ; ".join(skip_reasons)) -def test_page_retirement_supported(for_all_devices): + +def test_page_retirement_supported(all_devices): """ Verifies that on platforms that support page_retirement, APIs will return success """ - device = for_all_devices + skip_reasons = set() - if not supports_page_retirement(device): - pytest.skip("page_retirement not supported") + for device in all_devices: + if not supports_page_retirement(device): + skip_reasons.add(f"page_retirement not supported for {device}") + continue - if not util.supports_ecc(device): - pytest.skip("device doesn't support ECC") + if not util.supports_ecc(device): + skip_reasons.add(f"device doesn't support ECC for {device}") + continue - try: - for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES: - nvml.device_get_retired_pages(device, source) - except nvml.NotSupportedError: - pytest.skip("Exception case: Page retirment is not supported in this GPU") + try: + for source in PAGE_RETIREMENT_PUBLIC_CAUSE_TYPES: + nvml.device_get_retired_pages(device, source) + except nvml.NotSupportedError: + skip_reasons.add(f"Exception case: Page retirement is not supported in this GPU {device}") + continue + + nvml.device_get_retired_pages_pending_status(device) - nvml.device_get_retired_pages_pending_status(device) + if skip_reasons: + pytest.skip(" ; ".join(skip_reasons))