From 2df0c31750874b463d8d3122c72d480783ddfe4a Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 13:15:49 -0500 Subject: [PATCH 1/7] cuda.core.system: Better checks for when we expect APIs to be unsupported --- cuda_bindings/tests/nvml/conftest.py | 37 +++ cuda_bindings/tests/nvml/test_compute_mode.py | 11 +- cuda_bindings/tests/nvml/test_gpu.py | 18 +- cuda_bindings/tests/nvml/test_pynvml.py | 33 +-- cuda_core/cuda/core/system/_device.pyx | 57 +---- cuda_core/docs/source/api.rst | 2 +- cuda_core/tests/system/conftest.py | 39 +++ cuda_core/tests/system/test_system_device.py | 238 +++++++----------- 8 files changed, 208 insertions(+), 227 deletions(-) diff --git a/cuda_bindings/tests/nvml/conftest.py b/cuda_bindings/tests/nvml/conftest.py index 9c674a3ee0..7b7a06f847 100644 --- a/cuda_bindings/tests/nvml/conftest.py +++ b/cuda_bindings/tests/nvml/conftest.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE from collections import namedtuple +from contextlib import contextmanager import pytest from cuda.bindings import _nvml as nvml @@ -128,3 +129,39 @@ def pci_info(ngpus, handles): pci_info = [nvml.device_get_pci_info_v3(handles[i]) for i in range(ngpus)] assert len(pci_info) == ngpus return pci_info + + +@contextmanager +def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str | None): + device_arch = nvml.device_get_architecture(device) + + if isinstance(expected_device_arch, nvml.DeviceArch): + expected_device_arch_int = int(expected_device_arch) + elif expected_device_arch == "FERMI": + expected_device_arch_int = 1 + else: + expected_device_arch_int = 0 + + if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN: + # In this case, we don't /know/ if it will fail, but we are ok if it + # does or does not. 
+ + # TODO: There are APIs that are documented as supported only if the + # device has an InfoROM, but I couldn't find a way to detect that. For + # now, they are just handled as "possibly failing". + + try: + yield + except nvml.NotSupportedError: + pytest.skip( + f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} " + f"on device '{nvml.device_get_name(device)}'" + ) + elif int(device_arch) < expected_device_arch_int: + # In this case, we /know/ if will fail, and we want to assert that it does. + with pytest.raises(nvml.NotSupportedError): + yield + pytest.skip("Unsupported before {expected_device_arch.name}, got {nvml.device_get_name(device)}") + else: + # In this case, we /know/ it should work, and if it fails, the test should fail. + yield diff --git a/cuda_bindings/tests/nvml/test_compute_mode.py b/cuda_bindings/tests/nvml/test_compute_mode.py index 3a8079adbf..e217868499 100644 --- a/cuda_bindings/tests/nvml/test_compute_mode.py +++ b/cuda_bindings/tests/nvml/test_compute_mode.py @@ -7,6 +7,8 @@ import pytest from cuda.bindings import _nvml as nvml +from .conftest import unsupported_before + COMPUTE_MODES = [ nvml.ComputeMode.COMPUTEMODE_DEFAULT, nvml.ComputeMode.COMPUTEMODE_PROHIBITED, @@ -16,18 +18,11 @@ @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows") def test_compute_mode_supported_nonroot(all_devices): - skip_reasons = set() for device in all_devices: - try: + with unsupported_before(device, None): original_compute_mode = nvml.device_get_compute_mode(device) - except nvml.NotSupportedError: - skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}") - continue for cm in COMPUTE_MODES: with pytest.raises(nvml.NoPermissionError): nvml.device_set_compute_mode(device, cm) assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed" - - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) diff --git 
a/cuda_bindings/tests/nvml/test_gpu.py b/cuda_bindings/tests/nvml/test_gpu.py index cd82347088..79c478a407 100644 --- a/cuda_bindings/tests/nvml/test_gpu.py +++ b/cuda_bindings/tests/nvml/test_gpu.py @@ -5,6 +5,7 @@ from cuda.bindings import _nvml as nvml from . import util +from .conftest import unsupported_before def test_gpu_get_module_id(nvml_init): @@ -23,23 +24,14 @@ def test_gpu_get_module_id(nvml_init): def test_gpu_get_platform_info(all_devices): - skip_reasons = set() for device in all_devices: if util.is_vgpu(device): - skip_reasons.add(f"Not supported on vGPU device {device}") - continue + pytest.skip(f"Not supported on vGPU device {device}") - # TODO - # if device.feature_dict.board.chip < board_class.Architecture.Blackwell: - # test_utils.skip_test("Not supported on chip before Blackwell") + # Documentation says Blackwell or newer only, but this does seem to pass + # on some newer GPUs. - try: + with unsupported_before(device, None): platform_info = nvml.device_get_platform_info(device) - except nvml.NotSupportedError: - skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}") - continue assert isinstance(platform_info, nvml.PlatformInfo_v2) - - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) diff --git a/cuda_bindings/tests/nvml/test_pynvml.py b/cuda_bindings/tests/nvml/test_pynvml.py index d075b6f682..4157bfb772 100644 --- a/cuda_bindings/tests/nvml/test_pynvml.py +++ b/cuda_bindings/tests/nvml/test_pynvml.py @@ -10,6 +10,7 @@ from cuda.bindings import _nvml as nvml from . import util +from .conftest import unsupported_before XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail." 
@@ -66,7 +67,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info): def test_device_get_memory_affinity(handles, scope): size = 1024 for handle in handles: - node_set = nvml.device_get_memory_affinity(handle, size, scope) + with unsupported_before(handle, nvml.DeviceArch.KEPLER): + node_set = nvml.device_get_memory_affinity(handle, size, scope) assert node_set is not None assert len(node_set) == size @@ -76,7 +78,8 @@ def test_device_get_memory_affinity(handles, scope): def test_device_get_cpu_affinity_within_scope(handles, scope): size = 1024 for handle in handles: - cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope) + with unsupported_before(handle, nvml.DeviceArch.KEPLER): + cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope) assert cpu_set is not None assert len(cpu_set) == size @@ -136,22 +139,22 @@ def test_device_get_p2p_status(handles, index): def test_device_get_power_usage(ngpus, handles): for i in range(ngpus): - try: + # Note: documentation says this is supported on Fermi or newer, + # but in practice it fails on some later architectures. 
+ with unsupported_before(handles[i], None): power_mwatts = nvml.device_get_power_usage(handles[i]) - except nvml.NotSupportedError: - pytest.skip("device_get_power_usage not supported") assert power_mwatts >= 0.0 def test_device_get_total_energy_consumption(ngpus, handles): for i in range(ngpus): - try: + with unsupported_before(handles[i], nvml.DeviceArch.VOLTA): energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i]) - except nvml.NotSupportedError: - pytest.skip("device_get_total_energy_consumption not supported") + for j in range(10): # idle for 150 ms time.sleep(0.015) # and check for increase every 15 ms - energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i]) + with unsupported_before(handles[i], nvml.DeviceArch.VOLTA): + energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i]) assert energy_mjoules2 >= energy_mjoules1 if energy_mjoules2 > energy_mjoules1: break @@ -182,7 +185,8 @@ def test_device_get_memory_info(ngpus, handles): def test_device_get_utilization_rates(ngpus, handles): for i in range(ngpus): - urate = nvml.device_get_utilization_rates(handles[i]) + with unsupported_before(handles[i], "FERMI"): + urate = nvml.device_get_utilization_rates(handles[i]) assert urate.gpu >= 0 assert urate.memory >= 0 @@ -239,7 +243,8 @@ def test_device_get_utilization_rates(ngpus, handles): def test_device_get_pcie_throughput(ngpus, handles): for i in range(ngpus): - tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES) + with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL): + tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES) assert tx_bytes_tp >= 0 rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES) assert rx_bytes_tp >= 0 @@ -271,10 +276,10 @@ def test_device_get_pcie_throughput(ngpus, handles): def test_device_get_nvlink_capability(ngpus, handles, cap_type): for i in 
range(ngpus): for j in range(nvml.NVLINK_MAX_LINKS): - try: + # By the documentation, this should be supported on PASCAL or newer, + # but this also seems to fail on newer. + with unsupported_before(handles[i], None): cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type) - except nvml.NotSupportedError: - pytest.skip("NVLink capability not supported") assert cap >= 0 diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 71cb35b907..5a5ed425ef 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -21,6 +21,7 @@ ClocksEventReasons = nvml.ClocksEventReasons ClockType = nvml.ClockType CoolerControl = nvml.CoolerControl CoolerTarget = nvml.CoolerTarget +DeviceArch = nvml.DeviceArch EventType = nvml.EventType FanControlPolicy = nvml.FanControlPolicy FieldId = nvml.FieldId @@ -45,41 +46,6 @@ include "_performance.pxi" include "_temperature.pxi" -class DeviceArchitecture: - """ - Device architecture enumeration. - """ - - def __init__(self, architecture: int): - try: - self._architecture = nvml.DeviceArch(architecture) - except ValueError: - self._architecture = None - - @property - def id(self) -> int: - """ - The numeric id of the device architecture. - - Returns -1 if the device is unknown. - """ - if self._architecture is None: - return -1 - return int(self._architecture) - - @property - def name(self) -> str: - """ - The name of the device architecture. - - Returns "Unlisted" if the device is unknown. - """ - if self._architecture is None: - return "Unlisted" - name = self._architecture.name - return name[name.rfind("_") + 1 :].title() - - cdef class MemoryInfo: """ Memory allocation information for a device. @@ -952,16 +918,15 @@ cdef class Device: return [Pstates(x) for x in nvml.device_get_supported_performance_states(self._handle)] @property - def architecture(self) -> DeviceArchitecture: + def arch(self) -> DeviceArch: """ - Device architecture. 
For example, a Tesla V100 will report - ``DeviceArchitecture.name == "Volta"``, and RTX A6000 will report - ``DeviceArchitecture.name == "Ampere"``. If the device returns an - architecture that is unknown to NVML then ``DeviceArchitecture.name == - "Unknown"`` is reported, whereas an architecture that is unknown to - cuda.core.system is reported as ``DeviceArchitecture.name == "Unlisted"``. + Device architecture. + + For example, a Tesla V100 will report ``DeviceArch.name == + "VOLTA"``, and RTX A6000 will report ``DeviceArch.name == + "AMPERE"``. """ - return DeviceArchitecture(nvml.device_get_architecture(self._handle)) + return DeviceArch(nvml.device_get_architecture(self._handle)) @property def bar1_memory_info(self) -> BAR1MemoryInfo: @@ -1027,6 +992,8 @@ cdef class Device: """ Retrieves the globally unique board serial number associated with this device's board. + + For all products with an InfoROM. """ return nvml.device_get_serial(self._handle) @@ -1268,6 +1235,8 @@ cdef class Device: """ Get the addressing mode of the device. + For Turing™ or newer fully supported devices. 
+ Addressing modes can be one of: - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated @@ -1486,7 +1455,7 @@ __all__ = [ "CoolerInfo", "CoolerTarget", "Device", - "DeviceArchitecture", + "DeviceArch", "DeviceAttributes", "DeviceEvents", "EventData", diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 9772b78786..0b8bb51b95 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -103,7 +103,7 @@ CUDA system information and NVIDIA Management Library (NVML) system.CoolerControl system.CoolerInfo system.CoolerTarget - system.DeviceArchitecture + system.DeviceArch system.DeviceAttributes system.DeviceEvents system.EventData diff --git a/cuda_core/tests/system/conftest.py b/cuda_core/tests/system/conftest.py index ad2f06bfdb..5f36784112 100644 --- a/cuda_core/tests/system/conftest.py +++ b/cuda_core/tests/system/conftest.py @@ -3,9 +3,48 @@ # SPDX-License-Identifier: Apache-2.0 +from contextlib import contextmanager + import pytest from cuda.core import system skip_if_nvml_unsupported = pytest.mark.skipif( not system.CUDA_BINDINGS_NVML_IS_COMPATIBLE, reason="NVML support requires cuda.bindings version 12.9.6+ or 13.1.2+" ) + + +@contextmanager +def unsupported_before(device: system.Device, expected_device_arch: system.DeviceArch | str | None): + device_arch = device.arch + + if isinstance(expected_device_arch, system.DeviceArch): + expected_device_arch_int = int(expected_device_arch) + elif expected_device_arch == "FERMI": + expected_device_arch_int = 1 + else: + expected_device_arch_int = 0 + + if ( + expected_device_arch is None + or expected_device_arch == "HAS_INFOROM" + or device_arch == system.DeviceArch.UNKNOWN + ): + # In this case, we don't /know/ if it will fail, but we are ok if it + # does or does not. + + # TODO: There are APIs that are documented as supported only if the + # device has an InfoROM, but I couldn't find a way to detect that. 
For now, they + # are just handled as "possibly failing". + + try: + yield + except system.NotSupportedError: + pytest.skip(f"Unsupported call for device architecture {device_arch.name} on device '{device.name}'") + elif int(device_arch) < expected_device_arch_int: + # In this case, we /know/ if will fail, and we want to assert that it does. + with pytest.raises(system.NotSupportedError): + yield + pytest.skip("Unsupported before {expected_device_arch.name}, got {device_arch.name}") + else: + # In this case, we /know/ it should work, and if it fails, the test should fail. + yield diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 2e762ce860..f1479bde73 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -4,7 +4,7 @@ # ruff: noqa: E402 -from .conftest import skip_if_nvml_unsupported +from .conftest import skip_if_nvml_unsupported, unsupported_before pytestmark = skip_if_nvml_unsupported @@ -12,11 +12,11 @@ import multiprocessing import os import re -import sys import helpers import pytest from cuda.core import system +from cuda.core.system import DeviceArch if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE: from cuda.bindings import _nvml as nvml @@ -35,18 +35,14 @@ def test_device_count(): def test_device_architecture(): for device in system.Device.get_all_devices(): - device_arch = device.architecture - - assert isinstance(device_arch, system.DeviceArchitecture) - if sys.version_info < (3, 12): - assert device_arch.id in nvml.DeviceArch.__members__.values() - else: - assert device_arch.id in nvml.DeviceArch + device_arch = device.arch + assert isinstance(device_arch, system.DeviceArch) def test_device_bar1_memory(): for device in system.Device.get_all_devices(): - bar1_memory_info = device.bar1_memory_info + with unsupported_before(device, DeviceArch.KEPLER): + bar1_memory_info = device.bar1_memory_info free, total, used = ( bar1_memory_info.free, 
bar1_memory_info.total, @@ -66,54 +62,32 @@ def test_device_bar1_memory(): @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_device_cpu_affinity(): - skip_reasons = set() for device in system.Device.get_all_devices(): - try: + with unsupported_before(device, DeviceArch.KEPLER): affinity = device.get_cpu_affinity(system.AffinityScope.NODE) - except system.NotSupportedError: - skip_reasons.add(f"CPU affinity not supported on '{device.name}'") - else: - assert isinstance(affinity, list) - os.sched_setaffinity(0, affinity) - assert os.sched_getaffinity(0) == set(affinity) - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) + assert isinstance(affinity, list) + os.sched_setaffinity(0, affinity) + assert os.sched_getaffinity(0) == set(affinity) @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_affinity(): - skip_reasons = set() for device in system.Device.get_all_devices(): for scope in (system.AffinityScope.NODE, system.AffinityScope.SOCKET): - try: + with unsupported_before(device, DeviceArch.KEPLER): affinity = device.get_cpu_affinity(scope) - except system.NotSupportedError: - skip_reasons.add(f"CPU affinity not supported on '{device.name}'") - else: - assert isinstance(affinity, list) + assert isinstance(affinity, list) - try: - affinity = device.get_memory_affinity(scope) - except system.NotSupportedError: - skip_reasons.add(f"Memory affinity not supported on '{device.name}'") - else: - assert isinstance(affinity, list) - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) + affinity = device.get_memory_affinity(scope) + assert isinstance(affinity, list) def test_numa_node_id(): - skip_reasons = set() for device in system.Device.get_all_devices(): - try: + with unsupported_before(device, None): numa_node_id = device.numa_node_id - except system.NotSupportedError: - skip_reasons.add(f"NUMA node ID not 
supported by device '{device.name}'") - else: - assert isinstance(numa_node_id, int) - assert numa_node_id >= -1 - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) + assert isinstance(numa_node_id, int) + assert numa_node_id >= -1 def test_device_cuda_compute_capability(): @@ -210,18 +184,11 @@ def test_device_pci_info(): def test_device_serial(): - skip_reasons = set() for device in system.Device.get_all_devices(): - try: + with unsupported_before(device, "HAS_INFOROM"): serial = device.serial - except system.NotSupportedError: - skip_reasons.add(f"Device serial not supported by device '{device.name}'") - else: - assert isinstance(serial, str) - assert len(serial) > 0 - - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) + assert isinstance(serial, str) + assert len(serial) > 0 def test_device_uuid(): @@ -309,14 +276,9 @@ def test_device_pci_bus_id(): @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_device_attributes(): - skip_reasons = [] - for device in system.Device.get_all_devices(): - try: + with unsupported_before(device, DeviceArch.AMPERE): attributes = device.attributes - except system.NotSupportedError: - skip_reasons.append(f"Device attributes not supported on '{device.name}'") - continue assert isinstance(attributes, system.DeviceAttributes) assert isinstance(attributes.multiprocessor_count, int) @@ -332,21 +294,12 @@ def test_device_attributes(): assert isinstance(attributes.memory_size_mb, int) assert attributes.memory_size_mb > 0 - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) - def test_c2c_mode_enabled(): - skip_reasons = set() for device in system.Device.get_all_devices(): - try: + with unsupported_before(device, None): is_enabled = device.is_c2c_mode_enabled - except nvml.NotSupportedError: - skip_reasons.add(f"C2C mode info not supported on {device}") - else: - assert isinstance(is_enabled, bool) - if skip_reasons: - pytest.skip(" ; 
".join(skip_reasons)) + assert isinstance(is_enabled, bool) @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Persistence mode not supported on WSL or Windows") @@ -365,8 +318,6 @@ def test_persistence_mode_enabled(): def test_field_values(): - skip_reasons = set() - for device in system.Device.get_all_devices(): # TODO: Are there any fields that return double's? It would be good to # test those. @@ -376,12 +327,8 @@ def test_field_values(): system.FieldId.DEV_PCIE_COUNT_TX_BYTES, ] field_values = device.get_field_values(field_ids) - - try: + with unsupported_before(device, None): field_values.validate() - except system.NotSupportedError: - skip_reasons.add(f"Field values {field_ids} not supported on '{device.name}'") - continue with pytest.raises(TypeError): field_values["invalid_index"] @@ -418,20 +365,15 @@ def test_field_values(): assert len(field_values) == 1 assert field_values[0].value <= old_value - if skip_reasons: - pytest.skip(" ; ".join(skip_reasons)) - @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_get_all_devices_with_cpu_affinity(): - try: - for i in range(multiprocessing.cpu_count()): - for device in system.Device.get_all_devices_with_cpu_affinity(i): + for i in range(multiprocessing.cpu_count()): + for device in system.Device.get_all_devices_with_cpu_affinity(i): + with unsupported_before(device, DeviceArch.KEPLER): affinity = device.get_cpu_affinity() - assert isinstance(affinity, list) - assert i in affinity - except system.NotSupportedError: - pytest.skip("Getting devices with CPU affinity not supported") + assert isinstance(affinity, list) + assert i in affinity def test_index(): @@ -450,11 +392,10 @@ def test_module_id(): def test_addressing_mode(): for device in system.Device.get_all_devices(): - try: + # By docs, should be supported on TURING or newer, but experimentally, + # is also unsupported on other hardware. 
+ with unsupported_before(device, None): addressing_mode = device.addressing_mode - except system.NotSupportedError: - pytest.skip(f"Device addressing mode not supported by device '{device.name}'") - continue assert isinstance(addressing_mode, system.AddressingMode) @@ -469,7 +410,8 @@ def test_display_mode(): def test_repair_status(): for device in system.Device.get_all_devices(): - repair_status = device.repair_status + with unsupported_before(device, DeviceArch.AMPERE): + repair_status = device.repair_status assert isinstance(repair_status, system.RepairStatus) assert isinstance(repair_status.channel_repair_pending, bool) @@ -526,7 +468,8 @@ def test_get_minor_number(): def test_get_inforom_version(): for device in system.Device.get_all_devices(): - inforom = device.inforom + with unsupported_before(device, "HAS_INFOROM"): + inforom = device.inforom inforom_image_version = inforom.image_version assert isinstance(inforom_image_version, str) @@ -550,55 +493,35 @@ def test_get_inforom_version(): assert isinstance(duration_us, int) assert duration_us > 0 - try: + with unsupported_before(device, "HAS_INFOROM"): board_part_number = inforom.board_part_number - except system.NotSupportedError: - pass - else: - assert isinstance(board_part_number, str) - assert len(board_part_number) > 0 + assert isinstance(board_part_number, str) + assert len(board_part_number) > 0 inforom.validate() -def test_clock(): +def test_auto_boosted_clocks_enabled(): for device in system.Device.get_all_devices(): - try: + # This API is supported on KEPLER and newer, but it also seems + # unsupported elsewhere. 
+ with unsupported_before(device, None): current, default = device.get_auto_boosted_clocks_enabled() - except system.NotSupportedError: - pass - else: - assert isinstance(current, bool) - assert isinstance(default, bool) + assert isinstance(current, bool) + assert isinstance(default, bool) + +def test_clock(): + for device in system.Device.get_all_devices(): for clock_type in system.ClockType: clock = device.clock(clock_type) assert isinstance(clock, system.ClockInfo) - try: - current_mhz = clock.get_current_mhz() - except system.NotSupportedError: - continue - assert isinstance(current_mhz, int) - assert current_mhz >= 0 - - current_mhz = clock.get_current_mhz(system.ClockId.CURRENT) - assert isinstance(current_mhz, int) - assert current_mhz >= 0 + # These are ordered from oldest API to newest API so we test as much + # as we can on each hardware architecture. - max_mhz = clock.get_max_mhz() - assert isinstance(max_mhz, int) - assert max_mhz >= 0 - - try: - max_customer_boost = clock.get_max_customer_boost_mhz() - except system.NotSupportedError: - pass - else: - assert isinstance(max_customer_boost, int) - assert max_customer_boost >= 0 - - pstate = device.performance_state + with unsupported_before(device, "FERMI"): + pstate = device.performance_state min_, max_ = clock.get_min_max_clock_of_pstate_mhz(pstate) assert isinstance(min_, int) @@ -606,15 +529,30 @@ def test_clock(): assert isinstance(max_, int) assert max_ >= 0 - try: + with unsupported_before(device, "FERMI"): + max_mhz = clock.get_max_mhz() + assert isinstance(max_mhz, int) + assert max_mhz >= 0 + + with unsupported_before(device, DeviceArch.KEPLER): + current_mhz = clock.get_current_mhz() + assert isinstance(current_mhz, int) + assert current_mhz >= 0 + + with unsupported_before(device, DeviceArch.MAXWELL): offsets = clock.get_offsets(pstate) - except system.InvalidArgumentError: - offsets = system.ClockOffsets(nvml.ClockOffset_v1()) assert isinstance(offsets, system.ClockOffsets) assert 
isinstance(offsets.clock_offset_mhz, int) assert isinstance(offsets.max_offset_mhz, int) assert isinstance(offsets.min_offset_mhz, int) + # By docs, should be supported on PASCAL or newer, but experimentally, + # is also unsupported on other hardware. + with unsupported_before(device, None): + max_customer_boost = clock.get_max_customer_boost_mhz() + assert isinstance(max_customer_boost, int) + assert max_customer_boost >= 0 + def test_clock_event_reasons(): for device in system.Device.get_all_devices(): @@ -627,6 +565,12 @@ def test_clock_event_reasons(): def test_fan(): for device in system.Device.get_all_devices(): + # The fan APIs are only supported on discrete devices with fans, + # but when they are not available `device.num_fans` returns 0. + if device.num_fans == 0: + pytest.skip("Device has no fans to test") + continue + for fan_idx in range(device.num_fans): fan_info = device.fan(fan_idx) assert isinstance(fan_info, system.FanInfo) @@ -662,10 +606,13 @@ def test_fan(): def test_cooler(): for device in system.Device.get_all_devices(): - try: + # The cooler APIs are only supported on discrete devices with fans, + # but when they are not available `device.num_fans` returns 0. + if device.num_fans == 0: + pytest.skip("Device has no coolers to test") + + with unsupported_before(device, DeviceArch.MAXWELL): cooler_info = device.cooler - except system.NotSupportedError: - pytest.skip("CoolerInfo not supported on this device") assert isinstance(cooler_info, system.CoolerInfo) @@ -685,24 +632,21 @@ def test_temperature(): assert isinstance(sensor, int) assert sensor >= 0 - for threshold in list(system.TemperatureThresholds)[:-1]: - try: + # By docs, should be supported on KEPLER or newer, but experimentally, + # is also unsupported on other hardware. 
+ with unsupported_before(device, None): + for threshold in list(system.TemperatureThresholds)[:-1]: t = temperature.threshold(threshold) - except system.NotSupportedError: - continue - else: assert isinstance(t, int) assert t >= 0 - try: + with unsupported_before(device, None): margin = temperature.margin - except system.NotSupportedError: - pass - else: - assert isinstance(margin, int) - assert margin >= 0 + assert isinstance(margin, int) + assert margin >= 0 - thermals = temperature.thermal_settings(system.ThermalTarget.ALL) + with unsupported_before(device, None): + thermals = temperature.thermal_settings(system.ThermalTarget.ALL) assert isinstance(thermals, system.ThermalSettings) for i, sensor in enumerate(thermals): From 70a252daf3521ffd8bbed5dff1fac7ffc553d31b Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 13:28:19 -0500 Subject: [PATCH 2/7] Update cuda_core/tests/system/test_system_device.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cuda_core/tests/system/test_system_device.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index f1479bde73..dff618e12f 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -569,7 +569,6 @@ def test_fan(): # but when they are not available `device.num_fans` returns 0. 
if device.num_fans == 0: pytest.skip("Device has no fans to test") - continue for fan_idx in range(device.num_fans): fan_info = device.fan(fan_idx) From a9d074b8c728d1e72589577bc74070294285a444 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 13:31:08 -0500 Subject: [PATCH 3/7] Update cuda_core/tests/system/conftest.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cuda_core/tests/system/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/system/conftest.py b/cuda_core/tests/system/conftest.py index 5f36784112..369b334869 100644 --- a/cuda_core/tests/system/conftest.py +++ b/cuda_core/tests/system/conftest.py @@ -44,7 +44,7 @@ def unsupported_before(device: system.Device, expected_device_arch: system.Devic # In this case, we /know/ if will fail, and we want to assert that it does. with pytest.raises(system.NotSupportedError): yield - pytest.skip("Unsupported before {expected_device_arch.name}, got {device_arch.name}") + pytest.skip(f"Unsupported before {expected_device_arch.name}, got {device_arch.name}") else: # In this case, we /know/ it should work, and if it fails, the test should fail. yield From c56e49b0f97f0ab4ea882528b576709bd6058fab Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 13:33:04 -0500 Subject: [PATCH 4/7] Update cuda_bindings/tests/nvml/conftest.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cuda_bindings/tests/nvml/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_bindings/tests/nvml/conftest.py b/cuda_bindings/tests/nvml/conftest.py index 7b7a06f847..0c67f02d08 100644 --- a/cuda_bindings/tests/nvml/conftest.py +++ b/cuda_bindings/tests/nvml/conftest.py @@ -161,7 +161,7 @@ def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str # In this case, we /know/ if will fail, and we want to assert that it does. 
with pytest.raises(nvml.NotSupportedError): yield - pytest.skip("Unsupported before {expected_device_arch.name}, got {nvml.device_get_name(device)}") + pytest.skip(f"Unsupported before {expected_device_arch.name}, got {nvml.device_get_name(device)}") else: # In this case, we /know/ it should work, and if it fails, the test should fail. yield From 08081396e9f96f6f5e5e7c19e10613569641b2ef Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 14:03:29 -0500 Subject: [PATCH 5/7] Fix importing of conftest.py --- cuda_core/tests/system/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/system/conftest.py b/cuda_core/tests/system/conftest.py index 369b334869..51106c35b2 100644 --- a/cuda_core/tests/system/conftest.py +++ b/cuda_core/tests/system/conftest.py @@ -14,7 +14,7 @@ @contextmanager -def unsupported_before(device: system.Device, expected_device_arch: system.DeviceArch | str | None): +def unsupported_before(device, expected_device_arch): device_arch = device.arch if isinstance(expected_device_arch, system.DeviceArch): From 9b571b463346e890e6d63c60281fe7b491f0858d Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 14:17:13 -0500 Subject: [PATCH 6/7] Fix tests --- cuda_core/tests/system/test_system_device.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index dff618e12f..29b9360205 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -16,11 +16,10 @@ import helpers import pytest from cuda.core import system -from cuda.core.system import DeviceArch if system.CUDA_BINDINGS_NVML_IS_COMPATIBLE: from cuda.bindings import _nvml as nvml - from cuda.core.system import _device + from cuda.core.system import DeviceArch, _device @pytest.fixture(autouse=True, scope="module") From 032c4e6f6ef32a4bb06dda67ff777cbc0747752f Mon 
Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 15:55:09 -0500 Subject: [PATCH 7/7] Fix tests --- cuda_core/tests/system/test_system_device.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 29b9360205..f1ee6f2698 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -276,7 +276,9 @@ def test_device_pci_bus_id(): @pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_device_attributes(): for device in system.Device.get_all_devices(): - with unsupported_before(device, DeviceArch.AMPERE): + # Docs say this should work on AMPERE or newer, but experimentally + # that's not the case. + with unsupported_before(device, None): attributes = device.attributes assert isinstance(attributes, system.DeviceAttributes) @@ -538,12 +540,18 @@ def test_clock(): assert isinstance(current_mhz, int) assert current_mhz >= 0 + # Docs say this should work on PASCAL or newer, but experimentally, + # is also unsupported on other hardware. with unsupported_before(device, DeviceArch.MAXWELL): - offsets = clock.get_offsets(pstate) - assert isinstance(offsets, system.ClockOffsets) - assert isinstance(offsets.clock_offset_mhz, int) - assert isinstance(offsets.max_offset_mhz, int) - assert isinstance(offsets.min_offset_mhz, int) + try: + offsets = clock.get_offsets(pstate) + except system.InvalidArgumentError: + pass + else: + assert isinstance(offsets, system.ClockOffsets) + assert isinstance(offsets.clock_offset_mhz, int) + assert isinstance(offsets.max_offset_mhz, int) + assert isinstance(offsets.min_offset_mhz, int) # By docs, should be supported on PASCAL or newer, but experimentally, # is also unsupported on other hardware.