From 14332cd4ba6404f3248484c3c75e1ae2525b63af Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:05:31 -0500 Subject: [PATCH 01/18] cuda.core.system: More device-related APIs --- cuda_bindings/cuda/bindings/_nvml.pxd | 7 +- cuda_bindings/cuda/bindings/_nvml.pyx | 274 ++++++++++--- cuda_core/cuda/core/system/_device.pyx | 395 ++++++++++++++++++- cuda_core/cuda/core/system/_inforom.pxi | 96 +++++ cuda_core/docs/source/api.rst | 10 + cuda_core/tests/system/test_system_device.py | 175 ++++++++ 6 files changed, 872 insertions(+), 85 deletions(-) create mode 100644 cuda_core/cuda/core/system/_inforom.pxi diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index d08b087b38..a0e6ed9ad9 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -34,7 +34,6 @@ ctypedef nvmlViolationTime_t ViolationTime ctypedef nvmlUUIDValue_t UUIDValue ctypedef nvmlVgpuPlacementList_v1_t VgpuPlacementList_v1 ctypedef nvmlNvLinkPowerThres_t NvLinkPowerThres -ctypedef nvmlSystemEventData_v1_t SystemEventData_v1 ctypedef nvmlGpuInstanceProfileInfo_t GpuInstanceProfileInfo ctypedef nvmlComputeInstanceProfileInfo_t ComputeInstanceProfileInfo ctypedef nvmlMask255_t Mask255 @@ -174,7 +173,7 @@ cpdef str device_get_inforom_version(intptr_t device, int object) cpdef str device_get_inforom_image_version(intptr_t device) cpdef unsigned int device_get_inforom_configuration_checksum(intptr_t device) except? 0 cpdef device_validate_inforom(intptr_t device) -cpdef unsigned long device_get_last_bbx_flush_time(intptr_t device, intptr_t timestamp) except? 0 +cpdef tuple device_get_last_bbx_flush_time(intptr_t device) cpdef int device_get_display_mode(intptr_t device) except? -1 cpdef int device_get_display_active(intptr_t device) except? -1 cpdef int device_get_persistence_mode(intptr_t device) except? 
-1 @@ -329,10 +328,6 @@ cpdef device_register_events(intptr_t device, unsigned long long event_types, in cpdef unsigned long long device_get_supported_event_types(intptr_t device) except? 0 cpdef object event_set_wait_v2(intptr_t set, unsigned int timeoutms) cpdef event_set_free(intptr_t set) -cpdef system_event_set_create(intptr_t request) -cpdef system_event_set_free(intptr_t request) -cpdef system_register_events(intptr_t request) -cpdef system_event_set_wait(intptr_t request) cpdef device_modify_drain_state(intptr_t pci_info, int new_state) cpdef int device_query_drain_state(intptr_t pci_info) except? -1 cpdef device_remove_gpu_v2(intptr_t pci_info, int gpu_state, int link_state) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 3a3f01ea7a..a14eb8571c 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -10155,11 +10155,157 @@ cdef class EventData: return obj +cdef _get_system_event_data_v1_dtype_offsets(): + cdef nvmlSystemEventData_v1_t pod = nvmlSystemEventData_v1_t() + return _numpy.dtype({ + 'names': ['event_type', 'gpu_id'], + 'formats': [_numpy.uint64, _numpy.uint32], + 'offsets': [ + (&(pod.eventType)) - (&pod), + (&(pod.gpuId)) - (&pod), + ], + 'itemsize': sizeof(nvmlSystemEventData_v1_t), + }) + +system_event_data_v1_dtype = _get_system_event_data_v1_dtype_offsets() + +cdef class SystemEventData_v1: + """Empty-initialize an array of `nvmlSystemEventData_v1_t`. + + The resulting object is of length `size` and of dtype `system_event_data_v1_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. 
seealso:: `nvmlSystemEventData_v1_t` + """ + cdef: + readonly object _data + + + + def __init__(self, size=1): + arr = _numpy.empty(size, dtype=system_event_data_v1_dtype) + self._data = arr.view(_numpy.recarray) + assert self._data.itemsize == sizeof(nvmlSystemEventData_v1_t), \ + f"itemsize {self._data.itemsize} mismatches struct size { sizeof(nvmlSystemEventData_v1_t) }" + + def __repr__(self): + if self._data.size > 1: + return f"<{__name__}.SystemEventData_v1_Array_{self._data.size} object at {hex(id(self))}>" + else: + return f"<{__name__}.SystemEventData_v1 object at {hex(id(self))}>" + + @property + def ptr(self): + """Get the pointer address to the data as Python :class:`int`.""" + return self._data.ctypes.data + + cdef intptr_t _get_ptr(self): + return self._data.ctypes.data + + def __int__(self): + if self._data.size > 1: + raise TypeError("int() argument must be a bytes-like object of size 1. " + "To get the pointer address of an array, use .ptr") + return self._data.ctypes.data + + def __len__(self): + return self._data.size + + def __eq__(self, other): + cdef object self_data = self._data + if (not isinstance(other, SystemEventData_v1)) or self_data.size != other._data.size or self_data.dtype != other._data.dtype: + return False + return bool((self_data == other._data).all()) + + @property + def event_type(self): + """Union[~_numpy.uint64, int]: Information about what specific system event occurred.""" + if self._data.size == 1: + return int(self._data.event_type[0]) + return self._data.event_type + + @event_type.setter + def event_type(self, val): + self._data.event_type = val + + @property + def gpu_id(self): + """Union[~_numpy.uint32, int]: gpuId in PCI format""" + if self._data.size == 1: + return int(self._data.gpu_id[0]) + return self._data.gpu_id + + @gpu_id.setter + def gpu_id(self, val): + self._data.gpu_id = val + + def __getitem__(self, key): + cdef ssize_t key_ + cdef ssize_t size + if isinstance(key, int): + key_ = key + size = 
self._data.size + if key_ >= size or key_ <= -(size+1): + raise IndexError("index is out of bounds") + if key_ < 0: + key_ += size + return SystemEventData_v1.from_data(self._data[key_:key_+1]) + out = self._data[key] + if isinstance(out, _numpy.recarray) and out.dtype == system_event_data_v1_dtype: + return SystemEventData_v1.from_data(out) + return out + + def __setitem__(self, key, val): + self._data[key] = val + + @staticmethod + def from_data(data): + """Create an SystemEventData_v1 instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `system_event_data_v1_dtype` holding the data. + """ + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + if not isinstance(data, _numpy.ndarray): + raise TypeError("data argument must be a NumPy ndarray") + if data.ndim != 1: + raise ValueError("data array must be 1D") + if data.dtype != system_event_data_v1_dtype: + raise ValueError("data array must be of dtype system_event_data_v1_dtype") + obj._data = data.view(_numpy.recarray) + + return obj + + @staticmethod + def from_ptr(intptr_t ptr, size_t size=1, bint readonly=False): + """Create an SystemEventData_v1 instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`. 
+ """ + if ptr == 0: + raise ValueError("ptr must not be null (0)") + cdef SystemEventData_v1 obj = SystemEventData_v1.__new__(SystemEventData_v1) + cdef flag = cpython.buffer.PyBUF_READ if readonly else cpython.buffer.PyBUF_WRITE + cdef object buf = cpython.memoryview.PyMemoryView_FromMemory( + ptr, sizeof(nvmlSystemEventData_v1_t) * size, flag) + data = _numpy.ndarray(size, buffer=buf, dtype=system_event_data_v1_dtype) + obj._data = data.view(_numpy.recarray) + + return obj + + cdef _get_accounting_stats_dtype_offsets(): cdef nvmlAccountingStats_t pod = nvmlAccountingStats_t() return _numpy.dtype({ 'names': ['gpu_utilization', 'memory_utilization', 'max_memory_usage', 'time', 'start_time', 'is_running', 'reserved'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, _numpy.uint32], + 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint64, _numpy.uint64, _numpy.uint64, _numpy.uint32, (_numpy.uint32, 5)], 'offsets': [ (&(pod.gpuUtilization)) - (&pod), (&(pod.memoryUtilization)) - (&pod), @@ -22082,23 +22228,26 @@ cpdef device_validate_inforom(intptr_t device): check_status(__status__) -cpdef unsigned long device_get_last_bbx_flush_time(intptr_t device, intptr_t timestamp) except? 0: +cpdef tuple device_get_last_bbx_flush_time(intptr_t device): """Retrieves the timestamp and the duration of the last flush of the BBX (blackbox) infoROM object during the current run. Args: device (intptr_t): The identifier of the target device. - timestamp (intptr_t): The start timestamp of the last BBX Flush. Returns: - unsigned long: The duration (us) of the last BBX Flush. + A 2-tuple containing: + + - unsigned long long: The start timestamp of the last BBX Flush. + - unsigned long: The duration (us) of the last BBX Flush. .. 
seealso:: `nvmlDeviceGetLastBBXFlushTime` """ + cdef unsigned long long timestamp cdef unsigned long duration_us with nogil: - __status__ = nvmlDeviceGetLastBBXFlushTime(device, timestamp, &duration_us) + __status__ = nvmlDeviceGetLastBBXFlushTime(device, &timestamp, &duration_us) check_status(__status__) - return duration_us + return (timestamp, duration_us) cpdef int device_get_display_mode(intptr_t device) except? -1: @@ -24913,58 +25062,6 @@ cpdef event_set_free(intptr_t set): check_status(__status__) -cpdef system_event_set_create(intptr_t request): - """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetCreateRequest_t. - - .. seealso:: `nvmlSystemEventSetCreate` - """ - with nogil: - __status__ = nvmlSystemEventSetCreate(request) - check_status(__status__) - - -cpdef system_event_set_free(intptr_t request): - """Releases system event set. - - Args: - request (intptr_t): Reference to nvmlSystemEventSetFreeRequest_t. - - .. seealso:: `nvmlSystemEventSetFree` - """ - with nogil: - __status__ = nvmlSystemEventSetFree(request) - check_status(__status__) - - -cpdef system_register_events(intptr_t request): - """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. - - Args: - request (intptr_t): Reference to the struct nvmlSystemRegisterEventRequest_t. - - .. seealso:: `nvmlSystemRegisterEvents` - """ - with nogil: - __status__ = nvmlSystemRegisterEvents(request) - check_status(__status__) - - -cpdef system_event_set_wait(intptr_t request): - """Waits on system events and delivers events. - - Args: - request (intptr_t): Reference in which to nvmlSystemEventSetWaitRequest_t. - - .. seealso:: `nvmlSystemEventSetWait` - """ - with nogil: - __status__ = nvmlSystemEventSetWait(request) - check_status(__status__) - - cpdef device_modify_drain_state(intptr_t pci_info, int new_state): """Modify the drain state of a GPU.
This method forces a GPU to no longer accept new incoming requests. Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before this call is made. Must be called as administrator. For Linux only. @@ -27908,3 +28005,64 @@ cpdef object device_get_nvlink_info(intptr_t device): __status__ = nvmlDeviceGetNvLinkInfo(device, info) check_status(__status__) return info_v1_py + + +cpdef intptr_t system_event_set_create(): + """Create an empty set of system events. Event set should be freed by ``nvmlSystemEventSetFree``.""" + cdef nvmlSystemEventSetCreateRequest_v1_t[1] request + with nogil: + request[0].version = sizeof(nvmlSystemEventSetCreateRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetCreate(request) + check_status(__status__) + return (request[0].set) + + +cpdef system_event_set_free(intptr_t event_set): + """Frees an event set.""" + cdef nvmlSystemEventSetFreeRequest_v1_t[1] request + request[0].set = event_set + with nogil: + request[0].version = sizeof(nvmlSystemEventSetFreeRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetFree(request) + check_status(__status__) + + +cpdef system_register_events(unsigned long long event_types, intptr_t event_set): + """Starts recording of events on system and add the events to specified ``nvmlSystemEventSet_t``. + + Args: + event_types (unsigned long long): Bitmask of nvmlSystemEventType_t values representing the events to register. + event_set (intptr_t): The system event set handle. + """ + cdef nvmlSystemRegisterEventRequest_v1_t[1] request + request[0].set = event_set + request[0].eventTypes = event_types + with nogil: + request[0].version = sizeof(nvmlSystemRegisterEventRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemRegisterEvents(request) + check_status(__status__) + + +cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, unsigned int buffer_size): + """Waits for events to occur on the system event set. 
+ + Args: + event_set (intptr_t): The system event set handle. + timeout_ms (unsigned int): The maximum amount of time in milliseconds to wait for an event. + buffer_size (unsigned int): The size of the event buffer. + + Returns: + SystemEvent: The system event that occurred. + """ + cdef nvmlSystemEventSetWaitRequest_v1_t[1] request + cdef SystemEventData_v1 event_data = SystemEventData_v1(buffer_size) + request[0].timeoutms = timeout_ms + request[0].set = event_set + request[0].data = (event_data._get_ptr()) + request[0].dataSize = buffer_size + with nogil: + request[0].version = sizeof(nvmlSystemEventSetWaitRequest_v1_t) | (1 << 24) + __status__ = nvmlSystemEventSetWait(request) + check_status(__status__) + event_data._data.resize((request[0].numEvent,)) + return event_data diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 2371c09c30..7d647f61aa 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -13,10 +13,17 @@ from cuda.bindings import _nvml as nvml from ._nvml_context cimport initialize include "_device_utils.pxi" +include "_inforom.pxi" +AddressingMode = nvml.DeviceAddressingModeType BrandType = nvml.BrandType FieldId = nvml.FieldId +GpuP2PCapsIndex = nvml.GpuP2PCapsIndex +GpuP2PStatus = nvml.GpuP2PStatus +GpuTopologyLevel = nvml.GpuTopologyLevel +InforomObject = nvml.InforomObject +PcieUtilCounter = nvml.PcieUtilCounter class DeviceArchitecture: @@ -127,52 +134,146 @@ cdef class PciInfo: """ PCI information about a GPU device. 
""" - cdef object _pci_info - def __init__(self, pci_info: nvml.PciInfo): - self._pci_info = pci_info + cdef object _pci_info_ext + cdef intptr_t _handle + + def __init__(self, pci_info_ext: nvml.PciInfoExt_v1, handle: int): + self._pci_info_ext = pci_info_ext + self._handle = handle @property def bus(self) -> int: """ The bus on which the device resides, 0 to 255 """ - return self._pci_info.bus + return self._pci_info_ext.bus @property def bus_id(self) -> str: """ The tuple domain:bus:device.function PCI identifier string """ - return self._pci_info.bus_id + return self._pci_info_ext.bus_id @property def device(self) -> int: """ The device's id on the bus, 0 to 31 """ - return self._pci_info.device_ + return self._pci_info_ext.device_ @property def domain(self) -> int: """ The PCI domain on which the device's bus resides, 0 to 0xffffffff """ - return self._pci_info.domain + return self._pci_info_ext.domain @property def vendor_id(self) -> int: """ The PCI vendor id of the device """ - return self._pci_info.pci_device_id & 0xFFFF + return self._pci_info_ext.pci_device_id & 0xFFFF @property def device_id(self) -> int: """ The PCI device id of the device """ - return self._pci_info.pci_device_id >> 16 + return self._pci_info_ext.pci_device_id >> 16 + + @property + def subsystem_id(self) -> int: + """ + The subsystem device ID + """ + return self._pci_info_ext.pci_sub_system_id + + @property + def base_class(self) -> int: + """ + The 8-bit PCI base class code + """ + return self._pci_info_ext.base_class + + @property + def sub_class(self) -> int: + """ + The 8-bit PCI sub class code + """ + return self._pci_info_ext.sub_class + + def get_max_pcie_link_generation(self) -> int: + """ + Retrieves the maximum PCIe link generation possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a generation 2 PCIe device attached to a generation 1 + PCIe bus, the max link generation this function will report is + generation 1. 
+ """ + return nvml.device_get_max_pcie_link_generation(self._handle) + + def get_gpu_max_pcie_link_generation(self) -> int: + """ + Retrieves the maximum PCIe link generation supported by this GPU device. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_gpu_max_pcie_link_generation(self._handle) + + def get_max_pcie_link_width(self) -> int: + """ + Retrieves the maximum PCIe link width possible with this device and system. + + For Fermi™ or newer fully supported devices. + + For example, for a device with a 16x PCIe bus width attached to an 8x + PCIe system bus this function will report + a max link width of 8. + """ + return nvml.device_get_max_pcie_link_width(self._handle) + + def get_current_pcie_link_generation(self) -> int: + """ + Retrieves the current PCIe link generation. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_curr_pcie_link_generation(self._handle) + + def get_current_pcie_link_width(self) -> int: + """ + Retrieves the current PCIe link width. + + For Fermi™ or newer fully supported devices. + """ + return nvml.device_get_curr_pcie_link_width(self._handle) + + def get_pcie_throughput(self, counter: PcieUtilCounter) -> int: + """ + Retrieve PCIe utilization information, in KB/s. + + This function is querying a byte counter over a 20ms interval, and thus + is the PCIe throughput over that interval. + + For Maxwell™ or newer fully supported devices. + + This method is not supported in virtual machines running virtual GPU + (vGPU). + """ + return nvml.device_get_pcie_throughput(self._handle, counter) + + def get_pcie_replay_counter(self) -> int: + """ + Retrieve the PCIe replay counter. + + For Kepler™ or newer fully supported devices. + """ + return nvml.device_get_pcie_replay_counter(self._handle) cdef class DeviceAttributes: @@ -381,6 +482,30 @@ cdef class FieldValues: return [x.value for x in self] +cdef class RepairStatus: + """ + Repair status for TPC/Channel repair.
+ """ + cdef object _repair_status + + def __init__(self, handle: int): + self._repair_status = nvml.device_get_repair_status(handle) + + @property + def channel_repair_pending(self) -> bool: + """ + `True` if a channel repair is pending. + """ + return bool(self._repair_status.b_channel_repair_pending) + + @property + def tpc_repair_pending(self) -> bool: + """ + `True` if a TPC repair is pending. + """ + return bool(self._repair_status.b_tpc_repair_pending) + + cdef class Device: """ Representation of a device. @@ -416,16 +541,23 @@ cdef class Device: cdef intptr_t _handle - def __init__(self, index: int | None = None, uuid: bytes | str | None = None, pci_bus_id: bytes | str | None = None): - initialize() - - args = [index, uuid, pci_bus_id] + def __init__( + self, + *, + index: int | None = None, + uuid: bytes | str | None = None, + pci_bus_id: bytes | str | None = None, + handle: int | None = None + ): + args = [index, uuid, pci_bus_id, handle] arg_count = sum(x is not None for x in args) if arg_count > 1: - raise ValueError("Handle requires only one of either device `index`, `uuid` or `pci_bus_id`.") + raise ValueError("Handle requires only one of either device `index`, `uuid`, `pci_bus_id` or `handle`.") if arg_count == 0: - raise ValueError("Handle requires either a device `index`, `uuid` or `pci_bus_id`.") + raise ValueError("Handle requires either a device `index`, `uuid`, `pci_bus_id` or `handle`.") + + initialize() if index is not None: self._handle = nvml.device_get_handle_by_index_v2(index) @@ -437,8 +569,20 @@ cdef class Device: if isinstance(pci_bus_id, bytes): pci_bus_id = pci_bus_id.decode("ascii") self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id) - else: - raise ValueError("Error parsing arguments") + elif handle is not None: + self._handle = handle + + @classmethod + def get_device_count(cls) -> int: + """ + Get the number of available devices. + + Returns + ------- + int + The number of available devices. 
+ """ + return nvml.device_get_count_v2() @classmethod def get_all_devices(cls) -> Iterable[Device]: @@ -450,9 +594,28 @@ cdef class Device: Iterator of Device An iterator over available devices. """ - total = nvml.device_get_count_v2() - for device_id in range(total): - yield cls(device_id) + for device_id in range(nvml.device_get_count_v2()): + yield cls(index=device_id) + + @classmethod + def get_all_devices_with_cpu_affinity(cls, cpu_index: int) -> Iterable[Device]: + """ + Retrieve the set of GPUs that have a CPU affinity with the given CPU number. + + Supported on Linux only. + + Parameters + ---------- + cpu_index: int + The CPU index. + + Returns + ------- + Iterator of Device + An iterator over available devices. + """ + for handle in nvml.system_get_topology_gpu_set(cpu_index): + yield cls(handle=handle) @property def architecture(self) -> DeviceArchitecture: @@ -539,7 +702,7 @@ cdef class Device: """ The PCI attributes of this device. """ - return PciInfo(nvml.device_get_pci_info_v3(self._handle)) + return PciInfo(nvml.device_get_pci_info_ext(self._handle), self._handle) @property def serial(self) -> str: @@ -559,6 +722,133 @@ cdef class Device: return nvml.device_get_uuid(self._handle) @property + def index(self) -> int: + """ + The NVML index of this device. + + Valid indices are derived from the count returned by + :meth:`Device.get_device_count`. For example, if ``get_device_count()`` + returns 2, the valid indices are 0 and 1, corresponding to GPU 0 and GPU + 1. + + The order in which NVML enumerates devices has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices be looked up by their PCI ids or GPU UUID. + + Note: The NVML index may not correlate with other APIs, such as the CUDA + device index. + """ + return nvml.device_get_index(self._handle) + + @property + def module_id(self) -> int: + """ + Get a unique identifier for the device module on the baseboard. 
+ + This API retrieves a unique identifier for each GPU module that exists + on a given baseboard. For non-baseboard products, this ID would always + be 0. + """ + return nvml.device_get_module_id(self._handle) + + @property + def minor_number(self) -> int: + """ + The minor number of this device. + + For Linux only. + + The minor number is used by the Linux device driver to identify the + device node in ``/dev/nvidiaX``. + """ + return nvml.device_get_minor_number(self._handle) + + @property + def board_part_number(self) -> str: + """ + Retrieves the device board part number which is programmed into the board's InfoROM. + """ + return nvml.device_get_board_part_number(self._handle) + + @property + def addressing_mode(self) -> AddressingMode: + """ + Get the addressing mode of the device. + + Addressing modes can be one of: + + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + software-based mirroring of the CPU's page tables, on the GPU. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_ATS`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + Address Translation Services. This means that there is (effectively) a + single set of page tables, and the CPU and GPU both use them. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HMM nor ATS + is active. + """ + return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) + + @property + def display_mode(self) -> bool: + """ + The display mode for this device. + + Indicates whether a physical display (e.g. monitor) is currently connected to + any of the device's connectors. + """ + return True if nvml.device_get_display_mode(self._handle) == nvml.EnableState.FEATURE_ENABLED else False + + @property + def display_active(self) -> bool: + """ + The display active status for this device. + + Indicates whether a display is initialized on the device.
For example, + whether X Server is attached to this device and has allocated memory for + the screen. + + Display can be active even when no monitor is physically attached. + """ + return True if nvml.device_get_display_active(self._handle) == nvml.EnableState.FEATURE_ENABLED else False + + @property + def repair_status(self) -> RepairStatus: + """ + Get the repair status for TPC/Channel repair. + + For Ampere™ or newer fully supported devices. + """ + return RepairStatus(self._handle) + + @property + def inforom(self) -> InforomInfo: + """ + Accessor for InfoROM information. + + For all products with an InfoROM. + """ + return InforomInfo(self) + + def get_topology_nearest_gpus(self, level: GpuTopologyLevel) -> Iterable[Device]: + """ + Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. + + Supported on Linux only. + + Parameters + ---------- + level: :class:`GpuTopologyLevel` + The topology level. + + Returns + ------- + Iterable of :class:`Device` + The nearest devices at the given topology level. + """ + for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): + yield Device(handle=handle) + def attributes(self) -> DeviceAttributes: """ Get various device attributes. @@ -632,7 +922,61 @@ cdef class Device: nvml.device_clear_field_values(self._handle, field_ids) +def get_topology_common_ancestor(device1: Device, device2: Device) -> GpuTopologyLevel: + """ + Retrieve the common ancestor for two devices. + + For Linux only. + + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + + Returns + ------- + :class:`GpuTopologyLevel` + The common ancestor level of the two devices. + """ + return GpuTopologyLevel( + nvml.device_get_topology_common_ancestor( + device1._handle, + device2._handle, + ) + ) + + +def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex) -> GpuP2PStatus: + """ + Retrieve the P2P status between two devices. 
+ + Parameters + ---------- + device1: :class:`Device` + The first device. + device2: :class:`Device` + The second device. + index: :class:`GpuP2PCapsIndex` + The P2P capability index being looked for between ``device1`` and ``device2``. + + Returns + ------- + :class:`GpuP2PStatus` + The P2P status between the two devices. + """ + return GpuP2PStatus( + nvml.device_get_p2p_status( + device1._handle, + device2._handle, + index, + ) + ) + + __all__ = [ + "AddressingMode", "BAR1MemoryInfo", "BrandType", "Device", @@ -641,6 +985,15 @@ __all__ = [ "FieldId", "FieldValue", "FieldValues", + "GpuP2PCapsIndex", + "GpuP2PStatus", + "GpuTopologyLevel", + "InforomInfo", + "InforomObject", "MemoryInfo", + "PcieUtilCounter", "PciInfo", + "RepairStatus", + "get_p2p_status", + "get_topology_common_ancestor", ] diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi new file mode 100644 index 0000000000..1b2e9325c0 --- /dev/null +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class InforomInfo: + cdef Device _device + + def __init__(self, device: Device): + self._device = device + + def get_version(self, inforom: InforomObject) -> str: + """ + Retrieves the InfoROM version for a given InfoROM object. + + For all products with an InfoROM. + + Fermi™ and higher parts have non-volatile on-board memory for persisting + device info, such as aggregate ECC counts. + + Parameters + ---------- + inforom: :class:`InforomObject` + The InfoROM object to query. + + Returns + ------- + str + The InfoROM version. + """ + return nvml.device_get_inforom_version(self._device._handle, inforom) + + @property + def image_version(self) -> str: + """ + Retrieves the global InfoROM image version. + + For all products with an InfoROM. 
+ + Image version just like VBIOS version uniquely describes the exact + version of the InfoROM flashed on the board in contrast to InfoROM + object version which is only an indicator of supported features. + + Returns + ------- + str + The InfoROM image version. + """ + return nvml.device_get_inforom_image_version(self._device._handle) + + @property + def configuration_checksum(self) -> int: + """ + Retrieves the checksum of the configuration stored in the device's InfoROM. + + For all products with an InfoROM. + + Can be used to make sure that two GPUs have the exact same + configuration. Current checksum takes into account configuration stored + in PWR and ECC InfoROM objects. Checksum can change between driver + releases or when user changes configuration (e.g. disable/enable ECC) + + Returns + ------- + int + The InfoROM checksum. + """ + return nvml.device_get_inforom_configuration_checksum(self._device._handle) + + def validate(self) -> None: + """ + Reads the InfoROM from the flash and verifies the checksums. + + For all products with an InfoROM. + + Raises + ------ + :class:`cuda.core.system.CorruptedInforomError` + If the device's InfoROM is corrupted. + """ + nvml.device_validate_inforom(self._device._handle) + + @property + def bbx_flush_time(self) -> int: + """ + Retrieves the timestamp and duration of the last flush of the BBX (blackbox) InfoROM object during the current run. + + For all products with an InfoROM.
+ + Returns + ------- + tuple[int, int] + - timestamp: The start timestamp of the last BBX flush + - duration_us: The duration (in μs) of the last BBX flush + """ + return nvml.device_get_last_bbx_flush_time(self._device._handle) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 15338383f6..14845b3e89 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -79,10 +79,13 @@ CUDA system information and NVIDIA Management Library (NVML) system.get_num_devices system.get_nvml_version system.get_process_name + system.get_topology_common_ancestor + system.get_p2p_status :template: autosummary/cyclass.rst system.Device + system.AddressingMode system.BAR1MemoryInfo system.BrandType system.DeviceArchitecture @@ -90,8 +93,15 @@ CUDA system information and NVIDIA Management Library (NVML) system.FieldId system.FieldValue system.FieldValues + system.GpuP2PCapsIndex + system.GpuP2PStatus + system.GpuTopologyLevel + system.InforomInfo + system.InforomObject system.MemoryInfo + system.PcieUtilCounter system.PciInfo + system.RepairStatus .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 52c08533ff..093ce33e3a 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -9,6 +9,7 @@ pytestmark = skip_if_nvml_unsupported import array +import multiprocessing import os import re import sys @@ -28,6 +29,10 @@ def check_gpu_available(): pytest.skip("No GPUs available to run device tests", allow_module_level=True) +def test_device_count(): + assert system.Device.get_device_count() == system.get_num_devices() + + def test_device_architecture(): for device in system.Device.get_all_devices(): device_arch = device.architecture @@ -138,6 +143,34 @@ def test_device_pci_info(): assert isinstance(pci_info.device_id, int) assert 0x0000 <= pci_info.device_id <= 0xFFFF + assert isinstance(pci_info.subsystem_id, int) + assert 0x00000000 <= pci_info.subsystem_id <= 0xFFFFFFFF + + assert isinstance(pci_info.base_class, int) + assert 0x00 <= pci_info.base_class <= 0xFF + + assert isinstance(pci_info.sub_class, int) + assert 0x00 <= pci_info.sub_class <= 0xFF + + assert isinstance(pci_info.get_max_pcie_link_generation(), int) + assert 0 <= pci_info.get_max_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_gpu_max_pcie_link_generation(), int) + assert 0 <= pci_info.get_gpu_max_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_max_pcie_link_width(), int) + assert 0 <= pci_info.get_max_pcie_link_width() <= 0xFF + + assert isinstance(pci_info.get_current_pcie_link_generation(), int) + assert 0 <= pci_info.get_current_pcie_link_generation() <= 0xFF + + assert isinstance(pci_info.get_current_pcie_link_width(), int) + assert 0 <= pci_info.get_current_pcie_link_width() <= 0xFF + + assert isinstance(pci_info.get_pcie_throughput(system.PcieUtilCounter.PCIE_UTIL_TX_BYTES), int) + + assert isinstance(pci_info.get_pcie_replay_counter(), int) + def 
test_device_serial(): skip_reasons = set() @@ -304,3 +337,145 @@ def test_field_values(): field_values.validate() assert len(field_values) == 1 assert field_values[0].value <= old_value + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_all_devices_with_cpu_affinity(): + try: + for i in range(multiprocessing.cpu_count()): + for device in system.Device.get_all_devices_with_cpu_affinity(i): + affinity = device.cpu_affinity + assert isinstance(affinity, list) + assert {i} == set(affinity) + except system.NotSupportedError: + pytest.skip("Getting devices with CPU affinity not supported") + + +def test_index(): + for i, device in enumerate(system.Device.get_all_devices()): + index = device.index + assert isinstance(index, int) + assert index == i + + +def test_module_id(): + for device in system.Device.get_all_devices(): + module_id = device.module_id + assert isinstance(module_id, int) + assert module_id >= 0 + + +def test_addressing_mode(): + for device in system.Device.get_all_devices(): + try: + addressing_mode = device.addressing_mode + except system.NotSupportedError: + pytest.skip(f"Device addressing mode not supported by device '{device.name}'") + continue + assert isinstance(addressing_mode, system.AddressingMode) + + +def test_display_mode(): + for device in system.Device.get_all_devices(): + display_mode = device.display_mode + assert isinstance(display_mode, bool) + + display_active = device.display_active + assert isinstance(display_active, bool) + + +def test_repair_status(): + for device in system.Device.get_all_devices(): + repair_status = device.repair_status + assert isinstance(repair_status, system.RepairStatus) + + assert isinstance(repair_status.channel_repair_pending, bool) + assert isinstance(repair_status.tpc_repair_pending, bool) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def 
test_get_topology_common_ancestor(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. + + if system.Device.get_device_count() < 2: + pytest.skip("Test requires at least 2 GPUs") + return + + devices = list(system.Device.get_all_devices()) + + ancestor = system.get_topology_common_ancestor(devices[0], devices[1]) + assert isinstance(ancestor, system.GpuTopologyLevel) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_p2p_status(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. + + if system.Device.get_device_count() < 2: + pytest.skip("Test requires at least 2 GPUs") + return + + devices = list(system.Device.get_all_devices()) + + status = system.get_p2p_status(devices[0], devices[1], system.GpuP2PCapsIndex.P2P_CAPS_INDEX_READ) + assert isinstance(status, system.GpuP2PStatus) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_nearest_gpus(): + # TODO: This is not a great test, and probably doesn't test much of anything + # in practice on our CI. 
+ + for device in system.Device.get_all_devices(): + for near_device in device.get_topology_nearest_gpus(system.GpuTopologyLevel.TOPOLOGY_SINGLE): + assert isinstance(near_device, system.Device) + + +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_get_minor_number(): + for device in system.Device.get_all_devices(): + minor_number = device.minor_number + assert isinstance(minor_number, int) + assert minor_number >= 0 + + +def test_board_part_number(): + for device in system.Device.get_all_devices(): + try: + board_part_number = device.board_part_number + except system.NotSupportedError: + pytest.skip(f"Device board part number not supported by device '{device.name}'") + continue + assert isinstance(board_part_number, str) + assert len(board_part_number) > 0 + + +def test_get_inforom_version(): + for device in system.Device.get_all_devices(): + inforom = device.inforom + + inforom_image_version = inforom.image_version + assert isinstance(inforom_image_version, str) + assert len(inforom_image_version) > 0 + + inforom_version = inforom.get_version(system.InforomObject.INFOROM_OEM) + assert isinstance(inforom_version, str) + assert len(inforom_version) > 0 + + checksum = inforom.configuration_checksum + assert isinstance(checksum, int) + + # TODO: This is untested locally. 
+ try: + timestamp, duration_us = inforom.bbx_flush_time + except system.NotSupportedError: + pass + else: + assert isinstance(timestamp, int) + assert timestamp > 0 + assert isinstance(duration_us, int) + assert duration_us > 0 + + inforom.validate() From 421ebda6ac33c7814bd8a917a69b30546b29abae Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:14:09 -0500 Subject: [PATCH 02/18] Fix line wrapping --- cuda_core/cuda/core/system/_inforom.pxi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi index 1b2e9325c0..f71c92b559 100644 --- a/cuda_core/cuda/core/system/_inforom.pxi +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -83,7 +83,8 @@ cdef class InforomInfo: @property def bbx_flush_time(self) -> int: """ - Retrieves the timestamp and duration of the last flush of the BBX (bloackbox) InfoROM object during the current run. + Retrieves the timestamp and duration of the last flush of the BBX + (bloackbox) InfoROM object during the current run. For all products with an InfoROM. 
From 16e71e0e1b2a740dc029674197600cf030c14b64 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:18:58 -0500 Subject: [PATCH 03/18] Hide handle as an implementation detail --- cuda_core/cuda/core/system/_device.pyx | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 7d647f61aa..237fea584e 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -547,15 +547,14 @@ cdef class Device: index: int | None = None, uuid: bytes | str | None = None, pci_bus_id: bytes | str | None = None, - handle: int | None = None ): - args = [index, uuid, pci_bus_id, handle] - arg_count = sum(x is not None for x in args) + args = [index, uuid, pci_bus_id] + cdef int arg_count = sum(arg is not None for arg in args) if arg_count > 1: - raise ValueError("Handle requires only one of either device `index`, `uuid`, `pci_bus_id` or `handle`.") + raise ValueError("Handle requires only one of `index`, `uuid`, or `pci_bus_id`.") if arg_count == 0: - raise ValueError("Handle requires either a device `index`, `uuid`, `pci_bus_id` or `handle`.") + raise ValueError("Handle requires either a device `index`, `uuid`, or `pci_bus_id`.") initialize() @@ -569,8 +568,6 @@ cdef class Device: if isinstance(pci_bus_id, bytes): pci_bus_id = pci_bus_id.decode("ascii") self._handle = nvml.device_get_handle_by_pci_bus_id_v2(pci_bus_id) - elif handle is not None: - self._handle = handle @classmethod def get_device_count(cls) -> int: @@ -615,7 +612,9 @@ cdef class Device: An iterator over available devices. 
""" for handle in nvml.system_get_topology_gpu_set(cpu_index): - yield cls(handle=handle) + device = Device.__new__() + device._handle = handle + return device @property def architecture(self) -> DeviceArchitecture: From aafd1e981a3baf3947c351987ef2f04b25a13518 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:23:39 -0500 Subject: [PATCH 04/18] Update cuda_core/cuda/core/system/_device.pyx Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- cuda_core/cuda/core/system/_device.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 237fea584e..6348aa5f25 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -783,7 +783,7 @@ cdef class Device: memory (``malloc``, ``mmap``) is addressable from the device (GPU), via Address Translation Services. This means that there is (effectively) a single set of page tables, and the CPU and GPU both use them. - - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HHM or ATS + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HMM nor ATS is active. 
""" return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) From adad2502a01e55476036ba0b8fa4f0fd5f6f0db9 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 12 Jan 2026 16:28:31 -0500 Subject: [PATCH 05/18] Address comments from Copilot --- cuda_core/cuda/core/system/_device.pyx | 17 +++++------------ cuda_core/cuda/core/system/_inforom.pxi | 11 +++++++++-- cuda_core/tests/system/test_system_device.py | 19 ++++++++----------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 6348aa5f25..73edbd4be4 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -207,7 +207,7 @@ cdef class PciInfo: def get_max_pcie_link_generation(self) -> int: """ - Retrieves the maximum PCIe link generation possible with this device and system. + Retrieve the maximum PCIe link generation possible with this device and system. For Fermi™ or newer fully supported devices. @@ -219,7 +219,7 @@ cdef class PciInfo: def get_gpu_max_pcie_link_generation(self) -> int: """ - Retrieves the maximum PCIe link generation supported by this GPU device. + Retrieve the maximum PCIe link generation supported by this GPU device. For Fermi™ or newer fully supported devices. """ @@ -227,7 +227,7 @@ cdef class PciInfo: def get_max_pcie_link_width(self) -> int: """ - Retrieves the maximum PCIe link width possible with this device and system. + Retrieve the maximum PCIe link width possible with this device and system. For Fermi™ or newer fully supported devices. @@ -239,7 +239,7 @@ cdef class PciInfo: def get_current_pcie_link_generation(self) -> int: """ - Retrieves the current PCIe link generation. + Retrieve the current PCIe link generation. For Fermi™ or newer fully supported devices. """ @@ -247,7 +247,7 @@ cdef class PciInfo: def get_current_pcie_link_width(self) -> int: """ - Retreives the current PCIe link width. 
+ Retrieve the current PCIe link width. For Fermi™ or newer fully supported devices. """ @@ -762,13 +762,6 @@ cdef class Device: """ return nvml.device_get_minor_number(self._handle) - @property - def board_part_number(self) -> str: - """ - Retrieves the the device board part number which is programmed into the board's InfoROM. - """ - return nvml.device_get_board_part_number(self._handle) - @property def addressing_mode(self) -> AddressingMode: """ diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi index f71c92b559..c82347ee18 100644 --- a/cuda_core/cuda/core/system/_inforom.pxi +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -81,10 +81,10 @@ cdef class InforomInfo: nvml.device_validate_inforom(self._device._handle) @property - def bbx_flush_time(self) -> int: + def bbx_flush_time(self) -> tuple[int, int]: """ Retrieves the timestamp and duration of the last flush of the BBX - (bloackbox) InfoROM object during the current run. + (blackbox) InfoROM object during the current run. For all products with an InfoROM. @@ -95,3 +95,10 @@ cdef class InforomInfo: - duration_us: The duration (in μs) of the last BBX flush """ return nvml.device_get_last_bbx_flush_time(self._device._handle) + + @property + def board_part_number(self) -> str: + """ + The device board part number which is programmed into the board's InfoROM. 
+ """ + return nvml.device_get_board_part_number(self._device._handle) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 093ce33e3a..61ba2a2296 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -441,17 +441,6 @@ def test_get_minor_number(): assert minor_number >= 0 -def test_board_part_number(): - for device in system.Device.get_all_devices(): - try: - board_part_number = device.board_part_number - except system.NotSupportedError: - pytest.skip(f"Device board part number not supported by device '{device.name}'") - continue - assert isinstance(board_part_number, str) - assert len(board_part_number) > 0 - - def test_get_inforom_version(): for device in system.Device.get_all_devices(): inforom = device.inforom @@ -478,4 +467,12 @@ def test_get_inforom_version(): assert isinstance(duration_us, int) assert duration_us > 0 + try: + board_part_number = inforom.board_part_number + except system.NotSupportedError: + pass + else: + assert isinstance(board_part_number, str) + assert len(board_part_number) > 0 + inforom.validate() From eb088208d025dfad85c5311285debfef7e59d032 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 09:23:50 -0500 Subject: [PATCH 06/18] Working on tests --- cuda_bindings/cuda/bindings/_nvml.pyx | 20 ++++++++++---------- cuda_core/tests/system/test_system_device.py | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index a14eb8571c..dbb87e8d0b 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -27104,8 +27104,8 @@ cpdef object system_get_topology_gpu_set(unsigned int cpuNumber): __status__ = nvmlSystemGetTopologyGpuSet(cpuNumber, count, NULL) check_status_size(__status__) if count[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", 
mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlSystemGetTopologyGpuSet(cpuNumber, count, deviceArray.data) check_status(__status__) @@ -27144,8 +27144,8 @@ cpdef object unit_get_devices(intptr_t unit): __status__ = nvmlUnitGetDevices(unit, deviceCount, NULL) check_status_size(__status__) if deviceCount[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlUnitGetDevices(unit, deviceCount, deviceArray.data) check_status(__status__) @@ -27172,8 +27172,8 @@ cpdef object device_get_topology_nearest_gpus(intptr_t device, unsigned int leve ) check_status_size(__status__) if count[0] == 0: - return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] + cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetTopologyNearestGpus( device, @@ -27837,9 +27837,9 @@ cpdef object device_get_gpu_instances(intptr_t device, unsigned int profile_id): check_status_size(__status__) if count[0] == 0: - view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] + 
view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array gpuInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + cdef view.array gpuInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetGpuInstances(device, profile_id, gpuInstances.data, count) check_status(__status__) @@ -27863,9 +27863,9 @@ cpdef object gpu_instance_get_compute_instances(intptr_t gpu_instance, unsigned check_status_size(__status__) if count[0] == 0: - view.array(shape=(1,), itemsize=sizeof(intptr_t), format="i", mode="c")[:0] + view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array computeInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="i", mode="c") + cdef view.array computeInstances = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlGpuInstanceGetComputeInstances(gpu_instance, profile_id, computeInstances.data, count) check_status(__status__) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 61ba2a2296..d76ffa3631 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -247,6 +247,7 @@ def test_device_attributes(): except system.NotSupportedError: skip_reasons.append(f"Device attributes not supported on '{device.name}'") continue + print("Attribute type:", type(attributes)) assert isinstance(attributes, system.DeviceAttributes) assert isinstance(attributes.multiprocessor_count, int) @@ -459,7 +460,7 @@ def test_get_inforom_version(): # TODO: This is untested locally. 
try: timestamp, duration_us = inforom.bbx_flush_time - except system.NotSupportedError: + except (system.NotSupportedError, system.NotReadyError): pass else: assert isinstance(timestamp, int) From 51496f74cc66fff2b1a7001f07bc7d5159b2f48e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 13:22:18 -0500 Subject: [PATCH 07/18] Fix tests --- cuda_core/cuda/core/system/_device.pyx | 3 ++- cuda_core/tests/system/test_system_device.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 73edbd4be4..0e3247cbc6 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -612,7 +612,7 @@ cdef class Device: An iterator over available devices. """ for handle in nvml.system_get_topology_gpu_set(cpu_index): - device = Device.__new__() + device = Device.__new__(Device) device._handle = handle return device @@ -841,6 +841,7 @@ cdef class Device: for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): yield Device(handle=handle) + @property def attributes(self) -> DeviceAttributes: """ Get various device attributes. 
diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index d76ffa3631..6df72455d8 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -247,7 +247,6 @@ def test_device_attributes(): except system.NotSupportedError: skip_reasons.append(f"Device attributes not supported on '{device.name}'") continue - print("Attribute type:", type(attributes)) assert isinstance(attributes, system.DeviceAttributes) assert isinstance(attributes.multiprocessor_count, int) From 47a83ce756a96cd343a7cb53748e83790c049b0e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 13:57:50 -0500 Subject: [PATCH 08/18] Fix creating new device --- cuda_core/cuda/core/system/_device.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 0e3247cbc6..6df92b4ba8 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -611,6 +611,7 @@ cdef class Device: Iterator of Device An iterator over available devices. 
""" + cdef Device device for handle in nvml.system_get_topology_gpu_set(cpu_index): device = Device.__new__(Device) device._handle = handle From 3cfdabba42d14f1bb78d8109160a7492f94779ca Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 14:31:32 -0500 Subject: [PATCH 09/18] Fix iterator --- cuda_core/cuda/core/system/_device.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 6df92b4ba8..3434a2ab32 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -615,7 +615,7 @@ cdef class Device: for handle in nvml.system_get_topology_gpu_set(cpu_index): device = Device.__new__(Device) device._handle = handle - return device + yield device @property def architecture(self) -> DeviceArchitecture: From dd3118a953d9cf282e2fd64d544e6856b506990e Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 15:07:58 -0500 Subject: [PATCH 10/18] Fix affinity test --- cuda_core/tests/system/test_system_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 6df72455d8..2c6788ff45 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -346,7 +346,7 @@ def test_get_all_devices_with_cpu_affinity(): for device in system.Device.get_all_devices_with_cpu_affinity(i): affinity = device.cpu_affinity assert isinstance(affinity, list) - assert {i} == set(affinity) + assert i in affinity except system.NotSupportedError: pytest.skip("Getting devices with CPU affinity not supported") From 229366074a45ff6415e834dae013ec2911d407b3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 13 Jan 2026 18:19:15 -0500 Subject: [PATCH 11/18] Fix nearest GPUs --- cuda_core/cuda/core/system/_device.pyx | 5 ++++- 1 file changed, 4 insertions(+), 1 
deletion(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 3434a2ab32..856c840530 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -839,8 +839,11 @@ cdef class Device: Iterable of :class:`Device` The nearest devices at the given topology level. """ + cdef Device device for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): - yield Device(handle=handle) + device = Device.__new__(Device) + device._handle = handle + yield device @property def attributes(self) -> DeviceAttributes: From d71888cb50d36d1959f4f195797a54ec5b5b518d Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 14 Jan 2026 17:24:42 -0500 Subject: [PATCH 12/18] cuda.core.system: affinity, clock, fan, temperature and thermals --- cuda_bindings/cuda/bindings/_nvml.pxd | 5 +- cuda_bindings/cuda/bindings/_nvml.pyx | 414 +++---------------- cuda_core/cuda/core/system/_clock.pxi | 130 ++++++ cuda_core/cuda/core/system/_cooler.pxi | 31 ++ cuda_core/cuda/core/system/_device.pyx | 244 ++++++++++- cuda_core/cuda/core/system/_fan.pxi | 103 +++++ cuda_core/cuda/core/system/_inforom.pxi | 3 + cuda_core/cuda/core/system/_performance.pxi | 72 ++++ cuda_core/cuda/core/system/_system.pyx | 2 +- cuda_core/cuda/core/system/_temperature.pxi | 138 +++++++ cuda_core/docs/source/api.rst | 20 + cuda_core/tests/system/test_system_device.py | 211 +++++++++- 12 files changed, 998 insertions(+), 375 deletions(-) create mode 100644 cuda_core/cuda/core/system/_clock.pxi create mode 100644 cuda_core/cuda/core/system/_cooler.pxi create mode 100644 cuda_core/cuda/core/system/_fan.pxi create mode 100644 cuda_core/cuda/core/system/_performance.pxi create mode 100644 cuda_core/cuda/core/system/_temperature.pxi diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index a0e6ed9ad9..4dd1c728a2 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ 
b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -14,6 +14,8 @@ from .cy_nvml cimport * ############################################################################### ctypedef nvmlDramEncryptionInfo_v1_t DramEncryptionInfo_v1 +ctypedef nvmlMarginTemperature_v1_t MarginTemperature_v1 +ctypedef nvmlFanSpeedInfo_v1_t FanSpeedInfo_v1 ctypedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t ConfComputeSetKeyRotationThresholdInfo_v1 ctypedef nvmlSystemDriverBranchInfo_v1_t SystemDriverBranchInfo_v1 ctypedef nvmlTemperature_v1_t Temperature_v1 @@ -196,14 +198,12 @@ cpdef object device_get_supported_graphics_clocks(intptr_t device, unsigned int cpdef tuple device_get_auto_boosted_clocks_enabled(intptr_t device) cpdef unsigned int device_get_fan_speed(intptr_t device) except? 0 cpdef unsigned int device_get_fan_speed_v2(intptr_t device, unsigned int fan) except? 0 -cpdef object device_get_fan_speed_rpm(intptr_t device) cpdef unsigned int device_get_target_fan_speed(intptr_t device, unsigned int fan) except? 0 cpdef tuple device_get_min_max_fan_speed(intptr_t device) cpdef unsigned int device_get_fan_control_policy_v2(intptr_t device, unsigned int fan) except * cpdef unsigned int device_get_num_fans(intptr_t device) except? 0 cpdef object device_get_cooler_info(intptr_t device) cpdef unsigned int device_get_temperature_threshold(intptr_t device, int threshold_type) except? 0 -cpdef object device_get_margin_temperature(intptr_t device) cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_ind_ex) cpdef int device_get_performance_state(intptr_t device) except? -1 cpdef unsigned long long device_get_current_clocks_event_reasons(intptr_t device) except? 0 @@ -214,7 +214,6 @@ cpdef int device_get_mem_clk_vf_offset(intptr_t device) except? 
0 cpdef tuple device_get_min_max_clock_of_p_state(intptr_t device, int type, int pstate) cpdef tuple device_get_gpc_clk_min_max_vf_offset(intptr_t device) cpdef tuple device_get_mem_clk_min_max_vf_offset(intptr_t device) -cpdef object device_get_clock_offsets(intptr_t device) cpdef device_set_clock_offsets(intptr_t device, intptr_t info) cpdef object device_get_performance_modes(intptr_t device) cpdef object device_get_current_clock_freqs(intptr_t device) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index dbb87e8d0b..9169100ab5 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -1361,10 +1361,9 @@ class SystemEventType(_IntEnum): SYSTEM_EVENT_TYPE_GPU_DRIVER_BIND = 0x0000000000000002 -class ClocksEvent(_IntEnum): +class ClocksEventReasons(_IntEnum): CLOCKS_EVENT_REASON_GPU_IDLE = 0x0000000000000001 CLOCKS_EVENT_REASON_APPLICATIONS_CLOCKS_SETTING = 0x0000000000000002 - CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS = 0x0000000000000002 CLOCKS_EVENT_REASON_SW_POWER_CAP = 0x0000000000000004 CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x0000000000000008 CLOCKS_EVENT_REASON_SYNC_BOOST = 0x0000000000000010 @@ -1373,13 +1372,6 @@ class ClocksEvent(_IntEnum): CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE_SLOWDOWN = 0x0000000000000080 CLOCKS_EVENT_REASON_DISPLAY_CLOCK_SETTING = 0x0000000000000100 CLOCKS_EVENT_REASON_NONE = 0x0000000000000000 - CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x0000000000000001 - CLOCKS_THROTTLE_REASON_APPLICATIONS_CLOCKS_SETTING = 0x0000000000002 - CLOCKS_THROTTLE_REASON_SYNC_BOOST = 0x00000000000010 - CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x00000000000004 - CLOCKS_THROTTLE_REASON_SW_THERMAL_SLOWDOWN = 0x00000000000020 - CLOCKS_THROTTLE_REASON_DISPLAY_CLOCK_SETTING = 0x00000000000100 - CLOCKS_THROTTLE_REASON_NONE = 0x0000000000000000 class EncoderQuery(_IntEnum): @@ -4295,138 +4287,6 @@ cdef class CoolerInfo_v1: return obj -cdef _get_margin_temperature_v1_dtype_offsets(): - cdef 
nvmlMarginTemperature_v1_t pod = nvmlMarginTemperature_v1_t() - return _numpy.dtype({ - 'names': ['version', 'margin_temperature'], - 'formats': [_numpy.uint32, _numpy.int32], - 'offsets': [ - (&(pod.version)) - (&pod), - (&(pod.marginTemperature)) - (&pod), - ], - 'itemsize': sizeof(nvmlMarginTemperature_v1_t), - }) - -margin_temperature_v1_dtype = _get_margin_temperature_v1_dtype_offsets() - -cdef class MarginTemperature_v1: - """Empty-initialize an instance of `nvmlMarginTemperature_v1_t`. - - - .. seealso:: `nvmlMarginTemperature_v1_t` - """ - cdef: - nvmlMarginTemperature_v1_t *_ptr - object _owner - bint _owned - bint _readonly - - def __init__(self): - self._ptr = calloc(1, sizeof(nvmlMarginTemperature_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating MarginTemperature_v1") - self._owner = None - self._owned = True - self._readonly = False - - def __dealloc__(self): - cdef nvmlMarginTemperature_v1_t *ptr - if self._owned and self._ptr != NULL: - ptr = self._ptr - self._ptr = NULL - free(ptr) - - def __repr__(self): - return f"<{__name__}.MarginTemperature_v1 object at {hex(id(self))}>" - - @property - def ptr(self): - """Get the pointer address to the data as Python :class:`int`.""" - return (self._ptr) - - cdef intptr_t _get_ptr(self): - return (self._ptr) - - def __int__(self): - return (self._ptr) - - def __eq__(self, other): - cdef MarginTemperature_v1 other_ - if not isinstance(other, MarginTemperature_v1): - return False - other_ = other - return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlMarginTemperature_v1_t)) == 0) - - def __setitem__(self, key, val): - if key == 0 and isinstance(val, _numpy.ndarray): - self._ptr = malloc(sizeof(nvmlMarginTemperature_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating MarginTemperature_v1") - memcpy(self._ptr, val.ctypes.data, sizeof(nvmlMarginTemperature_v1_t)) - self._owner = None - self._owned = True - self._readonly = not val.flags.writeable - else: - setattr(self, key, 
val) - - @property - def version(self): - """int: The version number of this struct.""" - return self._ptr[0].version - - @version.setter - def version(self, val): - if self._readonly: - raise ValueError("This MarginTemperature_v1 instance is read-only") - self._ptr[0].version = val - - @property - def margin_temperature(self): - """int: The margin temperature value.""" - return self._ptr[0].marginTemperature - - @margin_temperature.setter - def margin_temperature(self, val): - if self._readonly: - raise ValueError("This MarginTemperature_v1 instance is read-only") - self._ptr[0].marginTemperature = val - - @staticmethod - def from_data(data): - """Create an MarginTemperature_v1 instance wrapping the given NumPy array. - - Args: - data (_numpy.ndarray): a single-element array of dtype `margin_temperature_v1_dtype` holding the data. - """ - return __from_data(data, "margin_temperature_v1_dtype", margin_temperature_v1_dtype, MarginTemperature_v1) - - @staticmethod - def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): - """Create an MarginTemperature_v1 instance wrapping the given pointer. - - Args: - ptr (intptr_t): pointer address as Python :class:`int` to the data. - owner (object): The Python object that owns the pointer. If not provided, data will be copied. - readonly (bool): whether the data is read-only (to the user). default is `False`. 
- """ - if ptr == 0: - raise ValueError("ptr must not be null (0)") - cdef MarginTemperature_v1 obj = MarginTemperature_v1.__new__(MarginTemperature_v1) - if owner is None: - obj._ptr = malloc(sizeof(nvmlMarginTemperature_v1_t)) - if obj._ptr == NULL: - raise MemoryError("Error allocating MarginTemperature_v1") - memcpy((obj._ptr), ptr, sizeof(nvmlMarginTemperature_v1_t)) - obj._owner = None - obj._owned = True - else: - obj._ptr = ptr - obj._owner = owner - obj._owned = False - obj._readonly = readonly - return obj - - cdef _get_clk_mon_fault_info_dtype_offsets(): cdef nvmlClkMonFaultInfo_t pod = nvmlClkMonFaultInfo_t() return _numpy.dtype({ @@ -4753,150 +4613,6 @@ cdef class ClockOffset_v1: return obj -cdef _get_fan_speed_info_v1_dtype_offsets(): - cdef nvmlFanSpeedInfo_v1_t pod = nvmlFanSpeedInfo_v1_t() - return _numpy.dtype({ - 'names': ['version', 'fan', 'speed'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32], - 'offsets': [ - (&(pod.version)) - (&pod), - (&(pod.fan)) - (&pod), - (&(pod.speed)) - (&pod), - ], - 'itemsize': sizeof(nvmlFanSpeedInfo_v1_t), - }) - -fan_speed_info_v1_dtype = _get_fan_speed_info_v1_dtype_offsets() - -cdef class FanSpeedInfo_v1: - """Empty-initialize an instance of `nvmlFanSpeedInfo_v1_t`. - - - .. 
seealso:: `nvmlFanSpeedInfo_v1_t` - """ - cdef: - nvmlFanSpeedInfo_v1_t *_ptr - object _owner - bint _owned - bint _readonly - - def __init__(self): - self._ptr = calloc(1, sizeof(nvmlFanSpeedInfo_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating FanSpeedInfo_v1") - self._owner = None - self._owned = True - self._readonly = False - - def __dealloc__(self): - cdef nvmlFanSpeedInfo_v1_t *ptr - if self._owned and self._ptr != NULL: - ptr = self._ptr - self._ptr = NULL - free(ptr) - - def __repr__(self): - return f"<{__name__}.FanSpeedInfo_v1 object at {hex(id(self))}>" - - @property - def ptr(self): - """Get the pointer address to the data as Python :class:`int`.""" - return (self._ptr) - - cdef intptr_t _get_ptr(self): - return (self._ptr) - - def __int__(self): - return (self._ptr) - - def __eq__(self, other): - cdef FanSpeedInfo_v1 other_ - if not isinstance(other, FanSpeedInfo_v1): - return False - other_ = other - return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlFanSpeedInfo_v1_t)) == 0) - - def __setitem__(self, key, val): - if key == 0 and isinstance(val, _numpy.ndarray): - self._ptr = malloc(sizeof(nvmlFanSpeedInfo_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating FanSpeedInfo_v1") - memcpy(self._ptr, val.ctypes.data, sizeof(nvmlFanSpeedInfo_v1_t)) - self._owner = None - self._owned = True - self._readonly = not val.flags.writeable - else: - setattr(self, key, val) - - @property - def version(self): - """int: the API version number""" - return self._ptr[0].version - - @version.setter - def version(self, val): - if self._readonly: - raise ValueError("This FanSpeedInfo_v1 instance is read-only") - self._ptr[0].version = val - - @property - def fan(self): - """int: the fan index""" - return self._ptr[0].fan - - @fan.setter - def fan(self, val): - if self._readonly: - raise ValueError("This FanSpeedInfo_v1 instance is read-only") - self._ptr[0].fan = val - - @property - def speed(self): - """int: OUT: the fan speed in 
RPM.""" - return self._ptr[0].speed - - @speed.setter - def speed(self, val): - if self._readonly: - raise ValueError("This FanSpeedInfo_v1 instance is read-only") - self._ptr[0].speed = val - - @staticmethod - def from_data(data): - """Create an FanSpeedInfo_v1 instance wrapping the given NumPy array. - - Args: - data (_numpy.ndarray): a single-element array of dtype `fan_speed_info_v1_dtype` holding the data. - """ - return __from_data(data, "fan_speed_info_v1_dtype", fan_speed_info_v1_dtype, FanSpeedInfo_v1) - - @staticmethod - def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): - """Create an FanSpeedInfo_v1 instance wrapping the given pointer. - - Args: - ptr (intptr_t): pointer address as Python :class:`int` to the data. - owner (object): The Python object that owns the pointer. If not provided, data will be copied. - readonly (bool): whether the data is read-only (to the user). default is `False`. - """ - if ptr == 0: - raise ValueError("ptr must not be null (0)") - cdef FanSpeedInfo_v1 obj = FanSpeedInfo_v1.__new__(FanSpeedInfo_v1) - if owner is None: - obj._ptr = malloc(sizeof(nvmlFanSpeedInfo_v1_t)) - if obj._ptr == NULL: - raise MemoryError("Error allocating FanSpeedInfo_v1") - memcpy((obj._ptr), ptr, sizeof(nvmlFanSpeedInfo_v1_t)) - obj._owner = None - obj._owned = True - else: - obj._ptr = ptr - obj._owner = owner - obj._owned = False - obj._readonly = readonly - return obj - - cdef _get_device_perf_modes_v1_dtype_offsets(): cdef nvmlDevicePerfModes_v1_t pod = nvmlDevicePerfModes_v1_t() return _numpy.dtype({ @@ -22669,26 +22385,6 @@ cpdef unsigned int device_get_fan_speed_v2(intptr_t device, unsigned int fan) ex return speed -cpdef object device_get_fan_speed_rpm(intptr_t device): - """Retrieves the intended operating speed in rotations per minute (RPM) of the device's specified fan. - - Args: - device (intptr_t): The identifier of the target device. 
- - Returns: - nvmlFanSpeedInfo_v1_t: Structure specifying the index of the target fan (input) and retrieved fan speed value (output). - - .. seealso:: `nvmlDeviceGetFanSpeedRPM` - """ - cdef FanSpeedInfo_v1 fan_speed_py = FanSpeedInfo_v1() - cdef nvmlFanSpeedInfo_t *fan_speed = (fan_speed_py._get_ptr()) - fan_speed.version = sizeof(nvmlFanSpeedInfo_v1_t) | (1 << 24) - with nogil: - __status__ = nvmlDeviceGetFanSpeedRPM(device, fan_speed) - check_status(__status__) - return fan_speed_py - - cpdef unsigned int device_get_target_fan_speed(intptr_t device, unsigned int fan) except? 0: """Retrieves the intended target speed of the device's specified fan. @@ -22806,26 +22502,6 @@ cpdef unsigned int device_get_temperature_threshold(intptr_t device, int thresho return temp -cpdef object device_get_margin_temperature(intptr_t device): - """Retrieves the thermal margin temperature (distance to nearest slowdown threshold). - - Args: - device (intptr_t): The identifier of the target device. - - Returns: - nvmlMarginTemperature_v1_t: Versioned structure in which to return the temperature reading. - - .. seealso:: `nvmlDeviceGetMarginTemperature` - """ - cdef MarginTemperature_v1 margin_temp_info_py = MarginTemperature_v1() - cdef nvmlMarginTemperature_t *margin_temp_info = (margin_temp_info_py._get_ptr()) - margin_temp_info.version = sizeof(nvmlMarginTemperature_v1_t) | (1 << 24) - with nogil: - __status__ = nvmlDeviceGetMarginTemperature(device, margin_temp_info) - check_status(__status__) - return margin_temp_info_py - - cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_ind_ex): """Used to execute a list of thermal system instructions. @@ -23023,26 +22699,6 @@ cpdef tuple device_get_mem_clk_min_max_vf_offset(intptr_t device): return (min_offset, max_offset) -cpdef object device_get_clock_offsets(intptr_t device): - """Retrieve min, max and current clock offset of some clock domain for a given PState. 
- - Args: - device (intptr_t): The identifier of the target device. - - Returns: - nvmlClockOffset_v1_t: Structure specifying the clock type (input) and the pstate (input) retrieved clock offset value (output), min clock offset (output) and max clock offset (output). - - .. seealso:: `nvmlDeviceGetClockOffsets` - """ - cdef ClockOffset_v1 info_py = ClockOffset_v1() - cdef nvmlClockOffset_t *info = (info_py._get_ptr()) - info.version = sizeof(nvmlClockOffset_v1_t) | (1 << 24) - with nogil: - __status__ = nvmlDeviceGetClockOffsets(device, info) - check_status(__status__) - return info_py - - cpdef device_set_clock_offsets(intptr_t device, intptr_t info): """Control current clock offset of some clock domain for a given PState. @@ -27206,15 +26862,13 @@ cpdef object device_get_temperature_v(intptr_t device, nvmlTemperatureSensors_t return temperature.temperature -cpdef object device_get_supported_performance_states(intptr_t device, unsigned int size): +cpdef object device_get_supported_performance_states(intptr_t device): """Get all supported Performance States (P-States) for the device. Args: device (Device): The identifier of the target device. - size (unsigned int): The number of states to return. """ - if size == 0: - return view.array(shape=(1,), itemsize=sizeof(unsigned int), format="I", mode="c")[:0] + cdef int size = 16 # NVML_MAX_GPU_PERF_STATES cdef view.array pstates = view.array(shape=(size,), itemsize=sizeof(unsigned int), format="I", mode="c") # The header says "size is the size of the pstates array in bytes". @@ -28066,3 +27720,65 @@ cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, check_status(__status__) event_data._data.resize((request[0].numEvent,)) return event_data + + +cpdef unsigned int device_get_fan_speed_rpm(intptr_t device, unsigned int fan): + """Retrieves the intended operating speed in rotations per minute (RPM) of the device's specified fan. + + Args: + device (intptr_t): The identifier of the target device. 
+ fan (unsigned int): The index of the fan to query. + + Returns: + rpm (unsigned int): The fan speed in RPM. + + .. seealso:: `nvmlDeviceGetFanSpeedRPM` + """ + cdef nvmlFanSpeedInfo_v1_t[1] fan_speed + fan_speed[0].version = sizeof(nvmlFanSpeedInfo_v1_t) | (1 << 24) + fan_speed[0].fan = fan + with nogil: + __status__ = nvmlDeviceGetFanSpeedRPM(device, fan_speed) + check_status(__status__) + return fan_speed[0].speed + + +cpdef int device_get_margin_temperature(intptr_t device): + """Retrieves the thermal margin temperature (distance to nearest slowdown threshold). + + Args: + device (intptr_t): The identifier of the target device. + + Returns: + margin_temperature (int): The margin temperature value. + + .. seealso:: `nvmlDeviceGetMarginTemperature` + """ + cdef nvmlMarginTemperature_v1_t[1] margin_temp_info + margin_temp_info[0].version = sizeof(nvmlMarginTemperature_v1_t) | (1 << 24) + with nogil: + __status__ = nvmlDeviceGetMarginTemperature(device, margin_temp_info) + check_status(__status__) + return margin_temp_info[0].marginTemperature + + +cpdef object device_get_clock_offsets(intptr_t device, nvmlClockType_t clock_type, nvmlPstates_t pstate): + """Retrieve min, max and current clock offset of some clock domain for a given PState. + + Args: + device (intptr_t): The identifier of the target device. + + Returns: + nvmlClockOffset_v1_t: Structure specifying the clock type (input) and the pstate (input) retrieved clock offset value (output), min clock offset (output) and max clock offset (output). + + .. 
seealso:: `nvmlDeviceGetClockOffsets` + """ + cdef ClockOffset_v1 info_py = ClockOffset_v1() + cdef nvmlClockOffset_v1_t *info = (info_py._get_ptr()) + info.version = sizeof(nvmlClockOffset_v1_t) | (1 << 24) + info.type = clock_type + info.pstate = pstate + with nogil: + __status__ = nvmlDeviceGetClockOffsets(device, info) + check_status(__status__) + return info_py diff --git a/cuda_core/cuda/core/system/_clock.pxi b/cuda_core/cuda/core/system/_clock.pxi new file mode 100644 index 0000000000..911ef4ce72 --- /dev/null +++ b/cuda_core/cuda/core/system/_clock.pxi @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class ClockOffsets: + """ + Contains clock offset information. + """ + + cdef object _clock_offset + + def __init__(self, clock_offset: nvml.ClockOffset): + self._clock_offset = clock_offset + + @property + def clock_offset_mhz(self) -> int: + """ + The current clock offset in MHz. + """ + return self._clock_offset.clock_offset_m_hz + + @property + def max_offset_mhz(self) -> int: + """ + The maximum clock offset in MHz. + """ + return self._clock_offset.max_clock_offset_m_hz + + @property + def min_offset_mhz(self) -> int: + """ + The minimum clock offset in MHz. + """ + return self._clock_offset.min_clock_offset_m_hz + + +cdef class ClockInfo: + """ + Accesses various clock information about a device. + """ + + cdef intptr_t _handle + cdef int _clock_type + + def __init__(self, handle, clock_type: ClockType): + self._handle = handle + self._clock_type = int(clock_type) + + def get_current_mhz(self, clock_id: ClockId = ClockId.CURRENT) -> int: + """ + Get the current clock speed of a specific clock domain, in MHz. + + For Kepler™ or newer fully supported devices. + + Parameters + ---------- + clock_id: :class:`ClockId` + The clock ID to query. + + Returns + ------- + int + The clock speed in MHz. 
+ """ + return nvml.device_get_clock(self._handle, self._clock_type, clock_id) + + def get_max_mhz(self) -> int: + """ + Get the maximum clock speed of a specific clock domain, in MHz. + + For Fermi™ or newer fully supported devices. + + Current P0 clocks (reported by :meth:`get_current_mhz` can differ from + max clocks by a few MHz. + + Returns + ------- + int + The maximum clock speed in MHz. + """ + return nvml.device_get_max_clock_info(self._handle, self._clock_type) + + def get_max_customer_boost_mhz(self) -> int: + """ + Get the maximum customer boost clock speed of a specific clock, in MHz. + + For Pascal™ or newer fully supported devices. + + Returns + ------- + int + The maximum customer boost clock speed in MHz. + """ + return nvml.device_get_max_customer_boost_clock(self._handle, self._clock_type) + + def get_min_max_clock_of_pstate_mhz(self, pstate: Pstates) -> tuple[int, int]: + """ + Get the minimum and maximum clock speeds for this clock domain + at a given performance state (Pstate), in MHz. + + Parameters + ---------- + pstate: :class:`Pstates` + The performance state to query. + + Returns + ------- + tuple[int, int] + A tuple containing the minimum and maximum clock speeds in MHz. + """ + return nvml.device_get_min_max_clock_of_p_state(self._handle, self._clock_type, pstate) + + def get_offsets(self, pstate: Pstates) -> ClockOffsets: + """ + Retrieve min, max and current clock offset of some clock domain for a given Pstate. + + For Maxwell™ or newer fully supported devices. + + Parameters + ---------- + pstate: :class:`Pstates` + The performance state to query. + + Returns + ------- + ClockOffsets + An object with the min, max and current clock offset. 
+ """ + return ClockOffsets(nvml.device_get_clock_offsets(self._handle, self._clock_type, pstate)) diff --git a/cuda_core/cuda/core/system/_cooler.pxi b/cuda_core/cuda/core/system/_cooler.pxi new file mode 100644 index 0000000000..4d49f7ae9e --- /dev/null +++ b/cuda_core/cuda/core/system/_cooler.pxi @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class CoolerInfo: + cdef object _cooler_info + + def __init__(self, cooler_info: nvml.CoolerInfo): + self._cooler_info = cooler_info + + @property + def signal_type(self) -> CoolerControl: + """ + The cooler's control signal characteristics. + + The possible types are restricted, variable and toggle. See + :class:`CoolerControl` for details. + """ + return CoolerControl(self._cooler_info.signal_type) + + @property + def target(self) -> list[CoolerTarget]: + """ + The target that cooler controls. + + Targets may be GPU, Memory, Power Supply, or all of these. See + :class:`CoolerTarget` for details. 
+ """ + cdef uint64_t[1] targets = [self._cooler_info.target] + return [CoolerTarget(1 << ev) for ev in _unpack_bitmask(targets)] diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 856c840530..3d96298141 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -12,18 +12,35 @@ from cuda.bindings import _nvml as nvml from ._nvml_context cimport initialize -include "_device_utils.pxi" -include "_inforom.pxi" - AddressingMode = nvml.DeviceAddressingModeType +AffinityScope = nvml.AffinityScope BrandType = nvml.BrandType +ClockId = nvml.ClockId +ClocksEventReasons = nvml.ClocksEventReasons +ClockType = nvml.ClockType +CoolerControl = nvml.CoolerControl +CoolerTarget = nvml.CoolerTarget FieldId = nvml.FieldId GpuP2PCapsIndex = nvml.GpuP2PCapsIndex GpuP2PStatus = nvml.GpuP2PStatus GpuTopologyLevel = nvml.GpuTopologyLevel InforomObject = nvml.InforomObject PcieUtilCounter = nvml.PcieUtilCounter +Pstates = nvml.Pstates +ThermalController = nvml.ThermalController +ThermalTarget = nvml.ThermalTarget +TemperatureSensors = nvml.TemperatureSensors +TemperatureThresholds = nvml.TemperatureThresholds + + +include "_clock.pxi" +include "_cooler.pxi" +include "_device_utils.pxi" +include "_fan.pxi" +include "_inforom.pxi" +include "_performance.pxi" +include "_temperature.pxi" class DeviceArchitecture: @@ -617,6 +634,187 @@ cdef class Device: device._handle = handle yield device + def get_memory_affinity(self, scope: AffinityScope) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + memory affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. 
+ """ + return _unpack_bitmask( + nvml.device_get_memory_affinity( + self._handle, + ceil(cpu_count() / 64), + scope + ) + ) + + def get_cpu_affinity(self, scope: AffinityScope) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + CPU affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. + """ + return _unpack_bitmask( + nvml.device_get_cpu_affinity( + self._handle, + ceil(cpu_count() / 64), + scope, + ) + ) + + def set_cpu_affinity(self): + """ + Sets the ideal affinity for the calling thread and device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + nvml.device_set_cpu_affinity(self._handle) + + def clear_cpu_affinity(self): + """ + Clear all affinity bindings for the calling thread. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + nvml.device_clear_cpu_affinity(self._handle) + + @property + def numa_node_id(self) -> int: + """ + The NUMA node of the given GPU device. + + This only applies to platforms where the GPUs are NUMA nodes. + """ + return nvml.device_get_numa_node_id(self._handle) + + def clock(self, clock_type: ClockType) -> ClockInfo: + """ + Get information about and manage a specific clock on a device. + """ + return ClockInfo(self._handle, clock_type) + + def get_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]: + """ + Retrieve the current state of auto boosted clocks on a device. + + For Kepler™ or newer fully supported devices. + + Auto Boosted clocks are enabled by default on some hardware, allowing + the GPU to run at higher clock rates to maximize performance as thermal + limits allow. + + On Pascal™ and newer hardware, Auto Boosted clocks are controlled + through application clocks. 
Use :meth:`set_application_clocks` and + :methd:`reset_application_clocks` to control Auto Boost behavior. + + Returns + ------- + bool + The current state of Auto Boosted clocks + bool + The default Auto Boosted clocks behavior + + """ + current, default = nvml.device_get_auto_boosted_clocks_enabled(self._handle) + return current == nvml.EnableState.FEATURE_ENABLED, default == nvml.EnableState.FEATURE_ENABLED + + def get_current_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves the current clocks event reasons. + + For all fully supported products. + """ + cdef uint64_t[1] reasons + reasons[0] = nvml.device_get_current_clocks_event_reasons(self._handle) + return [ClocksEventReasons(1 << reason) for reason in _unpack_bitmask(reasons)] + + def get_supported_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves supported clocks event reasons that can be returned by + :meth:`get_current_clock_event_reasons`. + + For all fully supported products. + + This method is not supported in virtual machines running virtual GPU (vGPU). + """ + cdef uint64_t[1] reasons + reasons[0] = nvml.device_get_supported_clocks_event_reasons(self._handle) + return [ClocksEventReasons(1 << reason) for reason in _unpack_bitmask(reasons)] + + def fan(self, fan: int = 0) -> FanInfo: + """ + Get information and manage a specific fan on a device. + """ + if fan < 0 or fan >= self.num_fans: + raise ValueError(f"Fan index {fan} is out of range [0, {self.num_fans})") + return FanInfo(self._handle, fan) + + @property + def num_fans(self) -> int: + """ + The number of fans on the device. + """ + return nvml.device_get_num_fans(self._handle) + + @property + def cooler(self) -> CoolerInfo: + """ + Get information about cooler on a device. + """ + return CoolerInfo(nvml.device_get_cooler_info(self._handle)) + + @property + def temperature(self) -> Temperature: + """ + Get information about temperatures on a device. 
+ """ + return Temperature(self._handle) + + @property + def performance_state(self) -> Pstates: + """ + The current performance state of the device. + + For Fermi™ or newer fully supported devices. + + See :class:`Pstates` for possible performance states. + """ + return Pstates(nvml.device_get_performance_state(self._handle)) + + @property + def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo: + """ + Retrieve performance monitor samples from the associated subdevice. + """ + return GpuDynamicPstatesInfo(nvml.device_get_dynamic_pstates_info(self._handle)) + + def get_supported_pstates(self) -> list[Pstates]: + """ + Get all supported Performance States (P-States) for the device. + + The returned list contains a contiguous list of valid P-States supported by + the device. + """ + return [Pstates(x) for x in nvml.device_get_supported_performance_states(self._handle)] + @property def architecture(self) -> DeviceArchitecture: """ @@ -640,22 +838,6 @@ cdef class Device: """ return BAR1MemoryInfo(nvml.device_get_bar1_memory_info(self._handle)) - @property - def cpu_affinity(self) -> list[int]: - """ - Get a list containing the CPU indices to which the GPU is directly connected. 
- - Examples - -------- - >>> Device(index=0).cpu_affinity - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59] - """ - return _unpack_bitmask(nvml.device_get_cpu_affinity( - self._handle, - ceil(cpu_count() / 64), - )) - @property def cuda_compute_capability(self) -> tuple[int, int]: """ @@ -974,14 +1156,28 @@ def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex) -> __all__ = [ "AddressingMode", + "AffinityScope", "BAR1MemoryInfo", "BrandType", + "ClockId", + "ClockInfo", + "ClockOffsets", + "ClocksEventReasons", + "ClockType", + "CoolerControl", + "CoolerInfo", + "CoolerTarget", "Device", "DeviceArchitecture", "DeviceAttributes", + "FanInfo", "FieldId", "FieldValue", "FieldValues", + "get_p2p_status", + "get_topology_common_ancestor", + "GpuDynamicPstatesInfo", + "GpuDynamicPstatesUtilization", "GpuP2PCapsIndex", "GpuP2PStatus", "GpuTopologyLevel", @@ -990,7 +1186,13 @@ __all__ = [ "MemoryInfo", "PcieUtilCounter", "PciInfo", + "Pstates", "RepairStatus", - "get_p2p_status", - "get_topology_common_ancestor", + "Temperature", + "TemperatureSensors", + "TemperatureThresholds", + "ThermalController", + "ThermalSensor", + "ThermalSettings", + "ThermalTarget", ] diff --git a/cuda_core/cuda/core/system/_fan.pxi b/cuda_core/cuda/core/system/_fan.pxi new file mode 100644 index 0000000000..6dccc445da --- /dev/null +++ b/cuda_core/cuda/core/system/_fan.pxi @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class FanInfo: + """ + Manages information related to a specific fan on a specific device. + """ + + cdef intptr_t _handle + cdef int _fan + + def __init__(self, handle: int, fan: int): + self._handle = handle + self._fan = fan + + @property + def speed(self) -> int: + """ + Get/set the intended operating speed of the device's fan. 
+ + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + return nvml.device_get_fan_speed_v2(self._handle, self._fan) + + @speed.setter + def speed(self, speed: int): + nvml.device_set_fan_speed_v2(self._handle, self._fan, speed) + + @property + def speed_rpm(self) -> int: + """ + The intended operating speed of the device's fan in rotations per minute + (RPM). + + For Maxwell™ or newer fully supported devices. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + """ + return nvml.device_get_fan_speed_rpm(self._handle, self._fan) + + @property + def target_speed(self) -> int: + """ + Retrieves the intended target speed of the device's specified fan. + + For all discrete products with dedicated fans. + + Normally, the driver dynamically adjusts the fan based on + the needs of the GPU. But when user set fan speed using :property:`speed` + the driver will attempt to make the fan achieve the setting in + :property:`speed`. The actual current speed of the fan + is reported in :property:`speed`. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + return nvml.device_get_target_fan_speed(self._handle, self._fan) + + @property + def min_max_speed(self) -> tuple[int, int]: + """ + Retrieves the minimum and maximum fan speed all of the device's fans. + + For all discrete products with dedicated fans. 
+ + Returns + ------- + tuple[int, int] + A tuple of (min_speed, max_speed) + """ + return nvml.device_get_min_max_fan_speed(self._handle) + + @property + def control_policy(self) -> FanControlPolicy: + """ + The current fan control policy. + + For Maxwell™ or newer fully supported devices. + + For all CUDA-capable discrete products with fans. + """ + return nvml.device_get_fan_control_policy_v2(self._handle, self._fan) + + def set_default_fan_speed(self): + """ + Set the speed of the fan control policy to default. + + For all CUDA-capable discrete products with fans. + """ + nvml.device_set_default_fan_speed_v2(self._handle, self._fan) diff --git a/cuda_core/cuda/core/system/_inforom.pxi b/cuda_core/cuda/core/system/_inforom.pxi index c82347ee18..6950cc5987 100644 --- a/cuda_core/cuda/core/system/_inforom.pxi +++ b/cuda_core/cuda/core/system/_inforom.pxi @@ -3,6 +3,9 @@ # SPDX-License-Identifier: Apache-2.0 +InforomObject = nvml.InforomObject + + cdef class InforomInfo: cdef Device _device diff --git a/cuda_core/cuda/core/system/_performance.pxi b/cuda_core/cuda/core/system/_performance.pxi new file mode 100644 index 0000000000..f39deefe35 --- /dev/null +++ b/cuda_core/cuda/core/system/_performance.pxi @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +# In cuda.bindings.nvml, this is an anonymous struct inside nvmlGpuDynamicPstatesInfo_t. + + +ctypedef struct _GpuDynamicPstatesUtilization: + unsigned int bIsPresent + unsigned int percentage + unsigned int incThreshold + unsigned int decThreshold + + +cdef class GpuDynamicPstatesUtilization: + cdef: + _GpuDynamicPstatesUtilization *_ptr + object _owner + + def __init__(self, ptr: int, owner: object): + self._ptr = <_GpuDynamicPstatesUtilization *>ptr + self._owner = owner + + @property + def is_present(self) -> bool: + """ + Set if the utilization domain is present on this GPU. 
+ """ + return bool(self._ptr[0].bIsPresent) + + @property + def percentage(self) -> int: + """ + Percentage of time where the domain is considered busy in the last 1-second interval. + """ + return self._ptr[0].percentage + + @property + def inc_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-increasing P-State change when crossed. + """ + return self._ptr[0].incThreshold + + @property + def dec_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-decreasing P-State change when crossed. + """ + return self._ptr[0].decThreshold + + +cdef class GpuDynamicPstatesInfo: + """ + Handles performance monitor samples from the device. + """ + cdef object _gpu_dynamic_pstates_info + + def __init__(self, gpu_dynamic_pstates_info: nvml.GpuDynamicPstatesInfo): + self._gpu_dynamic_pstates_info = gpu_dynamic_pstates_info + + def __len__(self): + return nvml.MAX_GPU_UTILIZATIONS + + def __getitem__(self, idx: int) -> GpuDynamicPstatesUtilization: + if idx < 0 or idx >= nvml.MAX_GPU_UTILIZATIONS: + raise IndexError("GPU dynamic P-states index out of range") + return GpuDynamicPstatesUtilization( + self._gpu_dynamic_pstates_info.utilization.ptr + idx * sizeof(_GpuDynamicPstatesUtilization), + self._gpu_dynamic_pstates_info + ) diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx index c29d20dd20..3e15420dc8 100644 --- a/cuda_core/cuda/core/system/_system.pyx +++ b/cuda_core/cuda/core/system/_system.pyx @@ -22,7 +22,7 @@ if CUDA_BINDINGS_NVML_IS_COMPATIBLE: from cuda.bindings import _nvml as nvml # TODO: We need to be even more specific than version numbers for development. # This can be removed once we have a release including everything we need. 
- for member in ["FieldId"]: + for member in ["FieldId", "ClocksEventReasons"]: if not hasattr(nvml, member): CUDA_BINDINGS_NVML_IS_COMPATIBLE = False break diff --git a/cuda_core/cuda/core/system/_temperature.pxi b/cuda_core/cuda/core/system/_temperature.pxi new file mode 100644 index 0000000000..75f9efae4f --- /dev/null +++ b/cuda_core/cuda/core/system/_temperature.pxi @@ -0,0 +1,138 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +# In cuda.bindings.nvml, this is an anonymous struct inside nvmlThermalSettings_t. + + +ctypedef struct _ThermalSensor: + int controller + int defaultMinTemp + int defaultMaxTemp + int currentTemp + int target + + +cdef class ThermalSensor: + cdef: + _ThermalSensor *_ptr + object _owner + + def __init__(self, ptr: int, owner: object): + self._ptr = <_ThermalSensor *>ptr + self._owner = owner + + @property + def controller(self) -> ThermalController: + return ThermalController(self._ptr[0].controller) + + @property + def default_min_temp(self) -> int: + return self._ptr[0].defaultMinTemp + + @property + def default_max_temp(self) -> int: + return self._ptr[0].defaultMaxTemp + + @property + def current_temp(self) -> int: + return self._ptr[0].currentTemp + + @property + def target(self) -> ThermalTarget: + return ThermalTarget(self._ptr[0].target) + + +cdef class ThermalSettings: + cdef object _thermal_settings + + def __init__(self, thermal_settings: nvml.ThermalSettings): + self._thermal_settings = thermal_settings + + def __len__(self): + # MAX_THERMAL_SENSORS_PER_GPU is 3 + return min(self._thermal_settings.count, 3) + + def __getitem__(self, idx: int) -> nvml.ThermalSensor: + if idx < 0 or idx >= len(self): + raise IndexError("Thermal sensor index out of range") + return ThermalSensor( + self._thermal_settings.sensor.ptr + idx * sizeof(_ThermalSensor), + self._thermal_settings + ) + + +cdef class Temperature: + cdef intptr_t _handle 
+ + def __init__(self, handle: int): + self._handle = handle + + def sensor( + self, + sensor: TemperatureSensors = TemperatureSensors.TEMPERATURE_GPU + ) -> int: + """ + Get the temperature reading from a specific sensor on the device, in + degrees Celsius. + + Parameters + ---------- + sensor: :class:`TemperatureSensors`, optional + The temperature sensor to query. + + Returns + ------- + int + The temperature in degrees Celsius. + """ + return nvml.device_get_temperature_v(self._handle, sensor) + + def threshold(self, threshold_type: TemperatureThresholds) -> int: + """ + Retrieves the temperature threshold for this GPU with the specified + threshold type, in degrees Celsius. + + For Kepler™ or newer fully supported devices. + + See :class:`TemperatureThresholds` for possible threshold types. + + Note: This API is no longer the preferred interface for retrieving the + following temperature thresholds on Ada and later architectures: + ``NVML_TEMPERATURE_THRESHOLD_SHUTDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_SLOWDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_MEM_MAX`` and + ``NVML_TEMPERATURE_THRESHOLD_GPU_MAX``. + + Support for reading these temperature thresholds for Ada and later + architectures would be removed from this API in future releases. Please + use :meth:`get_field_values` with ``NVML_FI_DEV_TEMPERATURE_*`` fields + to retrieve temperature thresholds on these architectures. + """ + return nvml.device_get_temperature_threshold(self._handle, threshold_type) + + @property + def margin(self) -> int: + """ + The thermal margin temperature (distance to nearest slowdown threshold) for the device. + """ + return nvml.device_get_margin_temperature(self._handle) + + def thermal_settings(self, sensor_index: ThermalTarget) -> ThermalSettings: + """ + Used to execute a list of thermal system instructions. + + TODO: The above docstring is from the NVML header, but it doesn't seem to make sense. 
+ + Parameters + ---------- + sensor_index: ThermalTarget + The index of the thermal sensor. + + Returns + ------- + :class:`ThermalSettings` + The thermal settings for the specified sensor. + """ + return ThermalSettings(nvml.device_get_thermal_settings(self._handle, sensor_index)) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 14845b3e89..41a66e94b4 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -86,13 +86,25 @@ CUDA system information and NVIDIA Management Library (NVML) system.Device system.AddressingMode + system.AffinityScope system.BAR1MemoryInfo system.BrandType + system.ClockId + system.ClockInfo + system.ClockOffsets + system.ClocksEventReasons + system.ClockType + system.CoolerControl + system.CoolerInfo + system.CoolerTarget system.DeviceArchitecture system.DeviceAttributes + system.FanInfo system.FieldId system.FieldValue system.FieldValues + system.GpuDynamicPstatesInfo + system.GpuDynamicPstatesUtilization system.GpuP2PCapsIndex system.GpuP2PStatus system.GpuTopologyLevel @@ -101,7 +113,15 @@ CUDA system information and NVIDIA Management Library (NVML) system.MemoryInfo system.PcieUtilCounter system.PciInfo + system.Pstates system.RepairStatus + system.Temperature + system.TemperatureSensors + system.TemperatureThresholds + system.ThermalController + system.ThermalSensor + system.ThermalSettings + system.ThermalTarget .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 2c6788ff45..bbd4fedf9d 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -64,11 +64,12 @@ def test_device_bar1_memory(): assert free + used == total +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_device_cpu_affinity(): skip_reasons = set() for device in system.Device.get_all_devices(): try: - affinity = device.cpu_affinity + affinity = device.get_cpu_affinity(system.AffinityScope.NODE) except system.NotSupportedError: skip_reasons.add(f"CPU affinity not supported on '{device.name}'") else: @@ -79,6 +80,42 @@ def test_device_cpu_affinity(): pytest.skip(" ; ".join(skip_reasons)) +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_affinity(): + skip_reasons = set() + for device in system.Device.get_all_devices(): + for scope in (system.AffinityScope.NODE, system.AffinityScope.SOCKET): + try: + affinity = device.get_cpu_affinity(scope) + except system.NotSupportedError: + skip_reasons.add(f"CPU affinity not supported on '{device.name}'") + else: + assert isinstance(affinity, list) + + try: + affinity = device.get_memory_affinity(scope) + except system.NotSupportedError: + skip_reasons.add(f"Memory affinity not supported on '{device.name}'") + else: + assert isinstance(affinity, list) + if skip_reasons: + pytest.skip(" ; ".join(skip_reasons)) + + +def test_numa_node_id(): + skip_reasons = set() + for device in system.Device.get_all_devices(): + try: + numa_node_id = device.numa_node_id + except system.NotSupportedError: + skip_reasons.add(f"NUMA node ID not supported by device '{device.name}'") + else: + assert isinstance(numa_node_id, int) + assert numa_node_id >= -1 + if skip_reasons: + pytest.skip(" ; 
".join(skip_reasons)) + + def test_device_cuda_compute_capability(): for device in system.Device.get_all_devices(): cuda_compute_capability = device.cuda_compute_capability @@ -476,3 +513,175 @@ def test_get_inforom_version(): assert len(board_part_number) > 0 inforom.validate() + + +def test_clock(): + for device in system.Device.get_all_devices(): + try: + current, default = device.get_auto_boosted_clocks_enabled() + except system.NotSupportedError: + pass + else: + assert isinstance(current, bool) + assert isinstance(default, bool) + + for clock_type in system.ClockType: + clock = device.clock(clock_type) + assert isinstance(clock, system.ClockInfo) + + for clock_type in system.ClockType: + try: + current_mhz = clock.get_current_mhz() + except system.NotSupportedError: + continue + assert isinstance(current_mhz, int) + assert current_mhz >= 0 + + current_mhz = clock.get_current_mhz(system.ClockId.CURRENT) + assert isinstance(current_mhz, int) + assert current_mhz >= 0 + + max_mhz = clock.get_max_mhz() + assert isinstance(max_mhz, int) + assert max_mhz >= 0 + + try: + max_customer_boost = clock.get_max_customer_boost_mhz() + except system.NotSupportedError: + pass + else: + assert isinstance(max_customer_boost, int) + assert max_customer_boost >= 0 + + pstate = device.performance_state + + min_, max_ = clock.get_min_max_clock_of_pstate_mhz(pstate) + assert isinstance(min_, int) + assert min_ >= 0 + assert isinstance(max_, int) + assert max_ >= 0 + + try: + offsets = clock.get_offsets(pstate) + except system.InvalidArgumentError: + offsets = system.ClockOffsets(nvml.ClockOffset_v1()) + else: + assert isinstance(offsets, system.ClockOffsets) + assert isinstance(offsets.clock_offset_mhz, int) + assert isinstance(offsets.max_offset_mhz, int) + assert isinstance(offsets.min_offset_mhz, int) + + +def test_clock_event_reasons(): + for device in system.Device.get_all_devices(): + reasons = device.get_current_clock_event_reasons() + assert all(isinstance(reason, 
system.ClocksEventReasons) for reason in reasons) + + reasons = device.get_supported_clock_event_reasons() + assert all(isinstance(reason, system.ClocksEventReasons) for reason in reasons) + + +def test_fan(): + for device in system.Device.get_all_devices(): + for fan_idx in range(device.num_fans): + fan_info = device.fan(fan_idx) + assert isinstance(fan_info, system.FanInfo) + + try: + speed = fan_info.speed + assert isinstance(speed, int) + assert 0 <= speed <= 200 + + fan_info.speed = 50 + fan_info.speed = speed + + speed_rpm = fan_info.speed_rpm + assert isinstance(speed_rpm, int) + assert speed_rpm >= 0 + + target_speed = fan_info.target_speed + assert isinstance(target_speed, int) + assert speed <= target_speed * 2 + + min_, max_ = fan_info.min_max_speed + assert isinstance(min_, int) + assert isinstance(max_, int) + assert min_ <= speed <= max_ + + control_policy = fan_info.control_policy + assert isinstance(control_policy, system.FanControlPolicy) + finally: + fan_info.set_default_fan_speed() + + +def test_cooler(): + for device in system.Device.get_all_devices(): + try: + cooler_info = device.cooler + except system.NotSupportedError: + pytest.skip("CoolerInfo not supported on this device") + + assert isinstance(cooler_info, system.CoolerInfo) + + signal_type = cooler_info.signal_type + assert isinstance(signal_type, system.CoolerSignalType) + + target = cooler_info.target + assert all(isinstance(t, system.CoolerTarget) for t in target) + + +def test_temperature(): + for device in system.Device.get_all_devices(): + temperature = device.temperature + assert isinstance(temperature, system.Temperature) + + sensor = temperature.sensor() + assert isinstance(sensor, int) + assert sensor >= 0 + + for threshold in list(system.TemperatureThresholds)[:-1]: + try: + t = temperature.threshold(threshold) + except system.NotSupportedError: + continue + else: + assert isinstance(t, int) + assert t >= 0 + + margin = temperature.margin + assert isinstance(margin, int) + 
assert margin >= 0 + + thermals = temperature.thermal_settings(system.ThermalTarget.ALL) + assert isinstance(thermals, system.ThermalSettings) + + for i, sensor in enumerate(thermals): + assert isinstance(sensor, system.ThermalSensor) + assert isinstance(sensor.target, system.ThermalTarget) + assert isinstance(sensor.controller, system.ThermalController) + assert isinstance(sensor.default_min_temp, int) + assert sensor.default_min_temp >= 0 + assert isinstance(sensor.default_max_temp, int) + assert sensor.default_max_temp >= sensor.default_min_temp + assert isinstance(sensor.current_temp, int) + assert sensor.default_min_temp <= sensor.current_temp <= sensor.default_max_temp + + +def test_pstates(): + for device in system.Device.get_all_devices(): + pstate = device.performance_state + assert isinstance(pstate, system.Pstates) + + pstates = device.get_supported_pstates() + assert all(isinstance(p, system.Pstates) for p in pstates) + + dynamic_pstates_info = device.dynamic_pstates_info + assert isinstance(dynamic_pstates_info, system.GpuDynamicPstatesInfo) + + assert len(dynamic_pstates_info) == nvml.MAX_GPU_UTILIZATIONS + + for utilization in dynamic_pstates_info: + assert isinstance(utilization.is_present, bool) + assert isinstance(utilization.percentage, int) + assert isinstance(utilization.inc_threshold, int) + assert isinstance(utilization.dec_threshold, int) From 43ab96ac8bebdcf19189de2f7a3243d30752a546 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 14 Jan 2026 17:48:29 -0500 Subject: [PATCH 13/18] Address comments from Copilot --- cuda_bindings/cuda/bindings/_nvml.pyx | 2 +- cuda_core/cuda/core/system/_device.pyx | 4 +- cuda_core/docs/source/api.rst | 1 + cuda_core/tests/system/test_system_device.py | 80 ++++++++++---------- 4 files changed, 44 insertions(+), 43 deletions(-) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index 9169100ab5..de2d9fafd0 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ 
b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -26829,7 +26829,7 @@ cpdef object device_get_topology_nearest_gpus(intptr_t device, unsigned int leve check_status_size(__status__) if count[0] == 0: return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") + cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetTopologyNearestGpus( device, diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 3d96298141..7a5c1d8307 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -21,6 +21,7 @@ ClocksEventReasons = nvml.ClocksEventReasons ClockType = nvml.ClockType CoolerControl = nvml.CoolerControl CoolerTarget = nvml.CoolerTarget +FanControlPolicy = nvml.FanControlPolicy FieldId = nvml.FieldId GpuP2PCapsIndex = nvml.GpuP2PCapsIndex GpuP2PStatus = nvml.GpuP2PStatus @@ -723,7 +724,7 @@ cdef class Device: On Pascal™ and newer hardware, Auto Boosted clocks are controlled through application clocks. Use :meth:`set_application_clocks` and - :methd:`reset_application_clocks` to control Auto Boost behavior. + :meth:`reset_application_clocks` to control Auto Boost behavior. 
Returns ------- @@ -1170,6 +1171,7 @@ __all__ = [ "Device", "DeviceArchitecture", "DeviceAttributes", + "FanControlPolicy", "FanInfo", "FieldId", "FieldValue", diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 41a66e94b4..4ba071143f 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -99,6 +99,7 @@ CUDA system information and NVIDIA Management Library (NVML) system.CoolerTarget system.DeviceArchitecture system.DeviceAttributes + system.FanControlPolicy system.FanInfo system.FieldId system.FieldValue diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index bbd4fedf9d..dedf4fe3d7 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -529,47 +529,45 @@ def test_clock(): clock = device.clock(clock_type) assert isinstance(clock, system.ClockInfo) - for clock_type in system.ClockType: - try: - current_mhz = clock.get_current_mhz() - except system.NotSupportedError: - continue - assert isinstance(current_mhz, int) - assert current_mhz >= 0 - - current_mhz = clock.get_current_mhz(system.ClockId.CURRENT) - assert isinstance(current_mhz, int) - assert current_mhz >= 0 - - max_mhz = clock.get_max_mhz() - assert isinstance(max_mhz, int) - assert max_mhz >= 0 - - try: - max_customer_boost = clock.get_max_customer_boost_mhz() - except system.NotSupportedError: - pass - else: - assert isinstance(max_customer_boost, int) - assert max_customer_boost >= 0 - - pstate = device.performance_state - - min_, max_ = clock.get_min_max_clock_of_pstate_mhz(pstate) - assert isinstance(min_, int) - assert min_ >= 0 - assert isinstance(max_, int) - assert max_ >= 0 + try: + current_mhz = clock.get_current_mhz() + except system.NotSupportedError: + continue + assert isinstance(current_mhz, int) + assert current_mhz >= 0 + + current_mhz = clock.get_current_mhz(system.ClockId.CURRENT) + assert isinstance(current_mhz, int) + 
assert current_mhz >= 0 + + max_mhz = clock.get_max_mhz() + assert isinstance(max_mhz, int) + assert max_mhz >= 0 - try: - offsets = clock.get_offsets(pstate) - except system.InvalidArgumentError: - offsets = system.ClockOffsets(nvml.ClockOffset_v1()) - else: - assert isinstance(offsets, system.ClockOffsets) - assert isinstance(offsets.clock_offset_mhz, int) - assert isinstance(offsets.max_offset_mhz, int) - assert isinstance(offsets.min_offset_mhz, int) + try: + max_customer_boost = clock.get_max_customer_boost_mhz() + except system.NotSupportedError: + pass + else: + assert isinstance(max_customer_boost, int) + assert max_customer_boost >= 0 + + pstate = device.performance_state + + min_, max_ = clock.get_min_max_clock_of_pstate_mhz(pstate) + assert isinstance(min_, int) + assert min_ >= 0 + assert isinstance(max_, int) + assert max_ >= 0 + + try: + offsets = clock.get_offsets(pstate) + except system.InvalidArgumentError: + offsets = system.ClockOffsets(nvml.ClockOffset_v1()) + assert isinstance(offsets, system.ClockOffsets) + assert isinstance(offsets.clock_offset_mhz, int) + assert isinstance(offsets.max_offset_mhz, int) + assert isinstance(offsets.min_offset_mhz, int) def test_clock_event_reasons(): @@ -624,7 +622,7 @@ def test_cooler(): assert isinstance(cooler_info, system.CoolerInfo) signal_type = cooler_info.signal_type - assert isinstance(signal_type, system.CoolerSignalType) + assert isinstance(signal_type, system.CoolerControl) target = cooler_info.target assert all(isinstance(t, system.CoolerTarget) for t in target) From f96f75f8ed6ee6708fff4130329fcc4026173931 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 15 Jan 2026 09:01:50 -0500 Subject: [PATCH 14/18] Fix tests --- cuda_core/cuda/core/system/_device.pyx | 6 +++--- cuda_core/tests/system/test_system_device.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index 
7a5c1d8307..dda95ceb48 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -635,7 +635,7 @@ cdef class Device: device._handle = handle yield device - def get_memory_affinity(self, scope: AffinityScope) -> list[int]: + def get_memory_affinity(self, scope: AffinityScope=AffinityScope.NODE) -> list[int]: """ Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal memory affinity for the device. @@ -656,7 +656,7 @@ cdef class Device: ) ) - def get_cpu_affinity(self, scope: AffinityScope) -> list[int]: + def get_cpu_affinity(self, scope: AffinityScope=AffinityScope.NODE) -> list[int]: """ Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal CPU affinity for the device. @@ -670,7 +670,7 @@ cdef class Device: ancestor of the device. """ return _unpack_bitmask( - nvml.device_get_cpu_affinity( + nvml.device_get_cpu_affinity_within_scope( self._handle, ceil(cpu_count() / 64), scope, diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index dedf4fe3d7..7f6adb9884 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -381,7 +381,7 @@ def test_get_all_devices_with_cpu_affinity(): try: for i in range(multiprocessing.cpu_count()): for device in system.Device.get_all_devices_with_cpu_affinity(i): - affinity = device.cpu_affinity + affinity = device.get_cpu_affinity() assert isinstance(affinity, list) assert i in affinity except system.NotSupportedError: @@ -646,9 +646,13 @@ def test_temperature(): assert isinstance(t, int) assert t >= 0 - margin = temperature.margin - assert isinstance(margin, int) - assert margin >= 0 + try: + margin = temperature.margin + except system.NotSupportedError: + pass + else: + assert isinstance(margin, int) + assert margin >= 0 thermals = temperature.thermal_settings(system.ThermalTarget.ALL) assert isinstance(thermals, system.ThermalSettings) From 
32c83aef2e448c34c67970cba314e577a5fa6417 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 15 Jan 2026 11:35:39 -0500 Subject: [PATCH 15/18] Fix test on WDDM Windows --- cuda_core/tests/system/test_system_device.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 7f6adb9884..62e6207d1f 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -604,7 +604,9 @@ def test_fan(): min_, max_ = fan_info.min_max_speed assert isinstance(min_, int) assert isinstance(max_, int) - assert min_ <= speed <= max_ + assert min <= max + if speed > 0: + assert min_ <= speed <= max_ control_policy = fan_info.control_policy assert isinstance(control_policy, system.FanControlPolicy) From 7293544c1465e7a6054efa1cd0adfbd28b7f7968 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 15 Jan 2026 14:24:53 -0500 Subject: [PATCH 16/18] Add comments about pointer lifetime. --- cuda_core/cuda/core/system/_performance.pxi | 2 ++ cuda_core/cuda/core/system/_temperature.pxi | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cuda_core/cuda/core/system/_performance.pxi b/cuda_core/cuda/core/system/_performance.pxi index f39deefe35..6ba1d40f9c 100644 --- a/cuda_core/cuda/core/system/_performance.pxi +++ b/cuda_core/cuda/core/system/_performance.pxi @@ -19,6 +19,8 @@ cdef class GpuDynamicPstatesUtilization: object _owner def __init__(self, ptr: int, owner: object): + # ptr points to a part of the numpy buffer held by `_owner`, so we need + # to maintain a reference to `_owner` to keep it alive. 
self._ptr = <_GpuDynamicPstatesUtilization *>ptr self._owner = owner diff --git a/cuda_core/cuda/core/system/_temperature.pxi b/cuda_core/cuda/core/system/_temperature.pxi index 75f9efae4f..20e5f6f99e 100644 --- a/cuda_core/cuda/core/system/_temperature.pxi +++ b/cuda_core/cuda/core/system/_temperature.pxi @@ -20,6 +20,8 @@ cdef class ThermalSensor: object _owner def __init__(self, ptr: int, owner: object): + # ptr points to a part of the numpy buffer held by `_owner`, so we need + # to maintain a reference to `_owner` to keep it alive. self._ptr = <_ThermalSensor *>ptr self._owner = owner From 3a621e5bbb5d316afc82ac1a8672a0b82a76427f Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 15 Jan 2026 14:26:02 -0500 Subject: [PATCH 17/18] Fix tests --- cuda_core/tests/system/test_system_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 76eae113c6..2b348dc41f 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -617,7 +617,7 @@ def test_fan(): min_, max_ = fan_info.min_max_speed assert isinstance(min_, int) assert isinstance(max_, int) - assert min <= max + assert min_ <= max_ if speed > 0: assert min_ <= speed <= max_ From 08dcbcc6886063e4264b1cdffc3462d2dd4d0d74 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 16 Jan 2026 10:13:11 -0500 Subject: [PATCH 18/18] Fix Fan.control_policy --- cuda_core/cuda/core/system/_fan.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/system/_fan.pxi b/cuda_core/cuda/core/system/_fan.pxi index 6dccc445da..18525a21b1 100644 --- a/cuda_core/cuda/core/system/_fan.pxi +++ b/cuda_core/cuda/core/system/_fan.pxi @@ -92,7 +92,7 @@ cdef class FanInfo: For all CUDA-capable discrete products with fans. 
""" - return nvml.device_get_fan_control_policy_v2(self._handle, self._fan) + return FanControlPolicy(nvml.device_get_fan_control_policy_v2(self._handle, self._fan)) def set_default_fan_speed(self): """