diff --git a/cuda_bindings/cuda/bindings/_nvml.pxd b/cuda_bindings/cuda/bindings/_nvml.pxd index a0e6ed9ad9..4dd1c728a2 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pxd +++ b/cuda_bindings/cuda/bindings/_nvml.pxd @@ -14,6 +14,8 @@ from .cy_nvml cimport * ############################################################################### ctypedef nvmlDramEncryptionInfo_v1_t DramEncryptionInfo_v1 +ctypedef nvmlMarginTemperature_v1_t MarginTemperature_v1 +ctypedef nvmlFanSpeedInfo_v1_t FanSpeedInfo_v1 ctypedef nvmlConfComputeSetKeyRotationThresholdInfo_v1_t ConfComputeSetKeyRotationThresholdInfo_v1 ctypedef nvmlSystemDriverBranchInfo_v1_t SystemDriverBranchInfo_v1 ctypedef nvmlTemperature_v1_t Temperature_v1 @@ -196,14 +198,12 @@ cpdef object device_get_supported_graphics_clocks(intptr_t device, unsigned int cpdef tuple device_get_auto_boosted_clocks_enabled(intptr_t device) cpdef unsigned int device_get_fan_speed(intptr_t device) except? 0 cpdef unsigned int device_get_fan_speed_v2(intptr_t device, unsigned int fan) except? 0 -cpdef object device_get_fan_speed_rpm(intptr_t device) cpdef unsigned int device_get_target_fan_speed(intptr_t device, unsigned int fan) except? 0 cpdef tuple device_get_min_max_fan_speed(intptr_t device) cpdef unsigned int device_get_fan_control_policy_v2(intptr_t device, unsigned int fan) except * cpdef unsigned int device_get_num_fans(intptr_t device) except? 0 cpdef object device_get_cooler_info(intptr_t device) cpdef unsigned int device_get_temperature_threshold(intptr_t device, int threshold_type) except? 0 -cpdef object device_get_margin_temperature(intptr_t device) cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_ind_ex) cpdef int device_get_performance_state(intptr_t device) except? -1 cpdef unsigned long long device_get_current_clocks_event_reasons(intptr_t device) except? 0 @@ -214,7 +214,6 @@ cpdef int device_get_mem_clk_vf_offset(intptr_t device) except? 
0 cpdef tuple device_get_min_max_clock_of_p_state(intptr_t device, int type, int pstate) cpdef tuple device_get_gpc_clk_min_max_vf_offset(intptr_t device) cpdef tuple device_get_mem_clk_min_max_vf_offset(intptr_t device) -cpdef object device_get_clock_offsets(intptr_t device) cpdef device_set_clock_offsets(intptr_t device, intptr_t info) cpdef object device_get_performance_modes(intptr_t device) cpdef object device_get_current_clock_freqs(intptr_t device) diff --git a/cuda_bindings/cuda/bindings/_nvml.pyx b/cuda_bindings/cuda/bindings/_nvml.pyx index dbb87e8d0b..ea8f56dc45 100644 --- a/cuda_bindings/cuda/bindings/_nvml.pyx +++ b/cuda_bindings/cuda/bindings/_nvml.pyx @@ -1198,16 +1198,16 @@ class PowerMizerMode(_IntEnum): class DeviceArch(_IntEnum): - DEVICE_ARCH_KEPLER = 2 - DEVICE_ARCH_MAXWELL = 3 - DEVICE_ARCH_PASCAL = 4 - DEVICE_ARCH_VOLTA = 5 - DEVICE_ARCH_TURING = 6 - DEVICE_ARCH_AMPERE = 7 - DEVICE_ARCH_ADA = 8 - DEVICE_ARCH_HOPPER = 9 - DEVICE_ARCH_BLACKWELL = 10 - DEVICE_ARCH_UNKNOWN = 0xFFFFFFFF + KEPLER = 2 + MAXWELL = 3 + PASCAL = 4 + VOLTA = 5 + TURING = 6 + AMPERE = 7 + ADA = 8 + HOPPER = 9 + BLACKWELL = 10 + UNKNOWN = 0xFFFFFFFF class BusType(_IntEnum): @@ -1361,10 +1361,9 @@ class SystemEventType(_IntEnum): SYSTEM_EVENT_TYPE_GPU_DRIVER_BIND = 0x0000000000000002 -class ClocksEvent(_IntEnum): +class ClocksEventReasons(_IntEnum): CLOCKS_EVENT_REASON_GPU_IDLE = 0x0000000000000001 CLOCKS_EVENT_REASON_APPLICATIONS_CLOCKS_SETTING = 0x0000000000000002 - CLOCKS_THROTTLE_REASON_USER_DEFINED_CLOCKS = 0x0000000000000002 CLOCKS_EVENT_REASON_SW_POWER_CAP = 0x0000000000000004 CLOCKS_THROTTLE_REASON_HW_SLOWDOWN = 0x0000000000000008 CLOCKS_EVENT_REASON_SYNC_BOOST = 0x0000000000000010 @@ -1373,13 +1372,6 @@ class ClocksEvent(_IntEnum): CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE_SLOWDOWN = 0x0000000000000080 CLOCKS_EVENT_REASON_DISPLAY_CLOCK_SETTING = 0x0000000000000100 CLOCKS_EVENT_REASON_NONE = 0x0000000000000000 - CLOCKS_THROTTLE_REASON_GPU_IDLE = 0x0000000000000001 - 
CLOCKS_THROTTLE_REASON_APPLICATIONS_CLOCKS_SETTING = 0x0000000000002 - CLOCKS_THROTTLE_REASON_SYNC_BOOST = 0x00000000000010 - CLOCKS_THROTTLE_REASON_SW_POWER_CAP = 0x00000000000004 - CLOCKS_THROTTLE_REASON_SW_THERMAL_SLOWDOWN = 0x00000000000020 - CLOCKS_THROTTLE_REASON_DISPLAY_CLOCK_SETTING = 0x00000000000100 - CLOCKS_THROTTLE_REASON_NONE = 0x0000000000000000 class EncoderQuery(_IntEnum): @@ -4295,138 +4287,6 @@ cdef class CoolerInfo_v1: return obj -cdef _get_margin_temperature_v1_dtype_offsets(): - cdef nvmlMarginTemperature_v1_t pod = nvmlMarginTemperature_v1_t() - return _numpy.dtype({ - 'names': ['version', 'margin_temperature'], - 'formats': [_numpy.uint32, _numpy.int32], - 'offsets': [ - (&(pod.version)) - (&pod), - (&(pod.marginTemperature)) - (&pod), - ], - 'itemsize': sizeof(nvmlMarginTemperature_v1_t), - }) - -margin_temperature_v1_dtype = _get_margin_temperature_v1_dtype_offsets() - -cdef class MarginTemperature_v1: - """Empty-initialize an instance of `nvmlMarginTemperature_v1_t`. - - - .. 
seealso:: `nvmlMarginTemperature_v1_t` - """ - cdef: - nvmlMarginTemperature_v1_t *_ptr - object _owner - bint _owned - bint _readonly - - def __init__(self): - self._ptr = calloc(1, sizeof(nvmlMarginTemperature_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating MarginTemperature_v1") - self._owner = None - self._owned = True - self._readonly = False - - def __dealloc__(self): - cdef nvmlMarginTemperature_v1_t *ptr - if self._owned and self._ptr != NULL: - ptr = self._ptr - self._ptr = NULL - free(ptr) - - def __repr__(self): - return f"<{__name__}.MarginTemperature_v1 object at {hex(id(self))}>" - - @property - def ptr(self): - """Get the pointer address to the data as Python :class:`int`.""" - return (self._ptr) - - cdef intptr_t _get_ptr(self): - return (self._ptr) - - def __int__(self): - return (self._ptr) - - def __eq__(self, other): - cdef MarginTemperature_v1 other_ - if not isinstance(other, MarginTemperature_v1): - return False - other_ = other - return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlMarginTemperature_v1_t)) == 0) - - def __setitem__(self, key, val): - if key == 0 and isinstance(val, _numpy.ndarray): - self._ptr = malloc(sizeof(nvmlMarginTemperature_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating MarginTemperature_v1") - memcpy(self._ptr, val.ctypes.data, sizeof(nvmlMarginTemperature_v1_t)) - self._owner = None - self._owned = True - self._readonly = not val.flags.writeable - else: - setattr(self, key, val) - - @property - def version(self): - """int: The version number of this struct.""" - return self._ptr[0].version - - @version.setter - def version(self, val): - if self._readonly: - raise ValueError("This MarginTemperature_v1 instance is read-only") - self._ptr[0].version = val - - @property - def margin_temperature(self): - """int: The margin temperature value.""" - return self._ptr[0].marginTemperature - - @margin_temperature.setter - def margin_temperature(self, val): - if self._readonly: - 
raise ValueError("This MarginTemperature_v1 instance is read-only") - self._ptr[0].marginTemperature = val - - @staticmethod - def from_data(data): - """Create an MarginTemperature_v1 instance wrapping the given NumPy array. - - Args: - data (_numpy.ndarray): a single-element array of dtype `margin_temperature_v1_dtype` holding the data. - """ - return __from_data(data, "margin_temperature_v1_dtype", margin_temperature_v1_dtype, MarginTemperature_v1) - - @staticmethod - def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): - """Create an MarginTemperature_v1 instance wrapping the given pointer. - - Args: - ptr (intptr_t): pointer address as Python :class:`int` to the data. - owner (object): The Python object that owns the pointer. If not provided, data will be copied. - readonly (bool): whether the data is read-only (to the user). default is `False`. - """ - if ptr == 0: - raise ValueError("ptr must not be null (0)") - cdef MarginTemperature_v1 obj = MarginTemperature_v1.__new__(MarginTemperature_v1) - if owner is None: - obj._ptr = malloc(sizeof(nvmlMarginTemperature_v1_t)) - if obj._ptr == NULL: - raise MemoryError("Error allocating MarginTemperature_v1") - memcpy((obj._ptr), ptr, sizeof(nvmlMarginTemperature_v1_t)) - obj._owner = None - obj._owned = True - else: - obj._ptr = ptr - obj._owner = owner - obj._owned = False - obj._readonly = readonly - return obj - - cdef _get_clk_mon_fault_info_dtype_offsets(): cdef nvmlClkMonFaultInfo_t pod = nvmlClkMonFaultInfo_t() return _numpy.dtype({ @@ -4753,150 +4613,6 @@ cdef class ClockOffset_v1: return obj -cdef _get_fan_speed_info_v1_dtype_offsets(): - cdef nvmlFanSpeedInfo_v1_t pod = nvmlFanSpeedInfo_v1_t() - return _numpy.dtype({ - 'names': ['version', 'fan', 'speed'], - 'formats': [_numpy.uint32, _numpy.uint32, _numpy.uint32], - 'offsets': [ - (&(pod.version)) - (&pod), - (&(pod.fan)) - (&pod), - (&(pod.speed)) - (&pod), - ], - 'itemsize': sizeof(nvmlFanSpeedInfo_v1_t), - }) - -fan_speed_info_v1_dtype = 
_get_fan_speed_info_v1_dtype_offsets() - -cdef class FanSpeedInfo_v1: - """Empty-initialize an instance of `nvmlFanSpeedInfo_v1_t`. - - - .. seealso:: `nvmlFanSpeedInfo_v1_t` - """ - cdef: - nvmlFanSpeedInfo_v1_t *_ptr - object _owner - bint _owned - bint _readonly - - def __init__(self): - self._ptr = calloc(1, sizeof(nvmlFanSpeedInfo_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating FanSpeedInfo_v1") - self._owner = None - self._owned = True - self._readonly = False - - def __dealloc__(self): - cdef nvmlFanSpeedInfo_v1_t *ptr - if self._owned and self._ptr != NULL: - ptr = self._ptr - self._ptr = NULL - free(ptr) - - def __repr__(self): - return f"<{__name__}.FanSpeedInfo_v1 object at {hex(id(self))}>" - - @property - def ptr(self): - """Get the pointer address to the data as Python :class:`int`.""" - return (self._ptr) - - cdef intptr_t _get_ptr(self): - return (self._ptr) - - def __int__(self): - return (self._ptr) - - def __eq__(self, other): - cdef FanSpeedInfo_v1 other_ - if not isinstance(other, FanSpeedInfo_v1): - return False - other_ = other - return (memcmp((self._ptr), (other_._ptr), sizeof(nvmlFanSpeedInfo_v1_t)) == 0) - - def __setitem__(self, key, val): - if key == 0 and isinstance(val, _numpy.ndarray): - self._ptr = malloc(sizeof(nvmlFanSpeedInfo_v1_t)) - if self._ptr == NULL: - raise MemoryError("Error allocating FanSpeedInfo_v1") - memcpy(self._ptr, val.ctypes.data, sizeof(nvmlFanSpeedInfo_v1_t)) - self._owner = None - self._owned = True - self._readonly = not val.flags.writeable - else: - setattr(self, key, val) - - @property - def version(self): - """int: the API version number""" - return self._ptr[0].version - - @version.setter - def version(self, val): - if self._readonly: - raise ValueError("This FanSpeedInfo_v1 instance is read-only") - self._ptr[0].version = val - - @property - def fan(self): - """int: the fan index""" - return self._ptr[0].fan - - @fan.setter - def fan(self, val): - if self._readonly: - raise 
ValueError("This FanSpeedInfo_v1 instance is read-only") - self._ptr[0].fan = val - - @property - def speed(self): - """int: OUT: the fan speed in RPM.""" - return self._ptr[0].speed - - @speed.setter - def speed(self, val): - if self._readonly: - raise ValueError("This FanSpeedInfo_v1 instance is read-only") - self._ptr[0].speed = val - - @staticmethod - def from_data(data): - """Create an FanSpeedInfo_v1 instance wrapping the given NumPy array. - - Args: - data (_numpy.ndarray): a single-element array of dtype `fan_speed_info_v1_dtype` holding the data. - """ - return __from_data(data, "fan_speed_info_v1_dtype", fan_speed_info_v1_dtype, FanSpeedInfo_v1) - - @staticmethod - def from_ptr(intptr_t ptr, bint readonly=False, object owner=None): - """Create an FanSpeedInfo_v1 instance wrapping the given pointer. - - Args: - ptr (intptr_t): pointer address as Python :class:`int` to the data. - owner (object): The Python object that owns the pointer. If not provided, data will be copied. - readonly (bool): whether the data is read-only (to the user). default is `False`. - """ - if ptr == 0: - raise ValueError("ptr must not be null (0)") - cdef FanSpeedInfo_v1 obj = FanSpeedInfo_v1.__new__(FanSpeedInfo_v1) - if owner is None: - obj._ptr = malloc(sizeof(nvmlFanSpeedInfo_v1_t)) - if obj._ptr == NULL: - raise MemoryError("Error allocating FanSpeedInfo_v1") - memcpy((obj._ptr), ptr, sizeof(nvmlFanSpeedInfo_v1_t)) - obj._owner = None - obj._owned = True - else: - obj._ptr = ptr - obj._owner = owner - obj._owned = False - obj._readonly = readonly - return obj - - cdef _get_device_perf_modes_v1_dtype_offsets(): cdef nvmlDevicePerfModes_v1_t pod = nvmlDevicePerfModes_v1_t() return _numpy.dtype({ @@ -22669,26 +22385,6 @@ cpdef unsigned int device_get_fan_speed_v2(intptr_t device, unsigned int fan) ex return speed -cpdef object device_get_fan_speed_rpm(intptr_t device): - """Retrieves the intended operating speed in rotations per minute (RPM) of the device's specified fan. 
- - Args: - device (intptr_t): The identifier of the target device. - - Returns: - nvmlFanSpeedInfo_v1_t: Structure specifying the index of the target fan (input) and retrieved fan speed value (output). - - .. seealso:: `nvmlDeviceGetFanSpeedRPM` - """ - cdef FanSpeedInfo_v1 fan_speed_py = FanSpeedInfo_v1() - cdef nvmlFanSpeedInfo_t *fan_speed = (fan_speed_py._get_ptr()) - fan_speed.version = sizeof(nvmlFanSpeedInfo_v1_t) | (1 << 24) - with nogil: - __status__ = nvmlDeviceGetFanSpeedRPM(device, fan_speed) - check_status(__status__) - return fan_speed_py - - cpdef unsigned int device_get_target_fan_speed(intptr_t device, unsigned int fan) except? 0: """Retrieves the intended target speed of the device's specified fan. @@ -22806,26 +22502,6 @@ cpdef unsigned int device_get_temperature_threshold(intptr_t device, int thresho return temp -cpdef object device_get_margin_temperature(intptr_t device): - """Retrieves the thermal margin temperature (distance to nearest slowdown threshold). - - Args: - device (intptr_t): The identifier of the target device. - - Returns: - nvmlMarginTemperature_v1_t: Versioned structure in which to return the temperature reading. - - .. seealso:: `nvmlDeviceGetMarginTemperature` - """ - cdef MarginTemperature_v1 margin_temp_info_py = MarginTemperature_v1() - cdef nvmlMarginTemperature_t *margin_temp_info = (margin_temp_info_py._get_ptr()) - margin_temp_info.version = sizeof(nvmlMarginTemperature_v1_t) | (1 << 24) - with nogil: - __status__ = nvmlDeviceGetMarginTemperature(device, margin_temp_info) - check_status(__status__) - return margin_temp_info_py - - cpdef object device_get_thermal_settings(intptr_t device, unsigned int sensor_ind_ex): """Used to execute a list of thermal system instructions. 
@@ -23023,26 +22699,6 @@ cpdef tuple device_get_mem_clk_min_max_vf_offset(intptr_t device): return (min_offset, max_offset) -cpdef object device_get_clock_offsets(intptr_t device): - """Retrieve min, max and current clock offset of some clock domain for a given PState. - - Args: - device (intptr_t): The identifier of the target device. - - Returns: - nvmlClockOffset_v1_t: Structure specifying the clock type (input) and the pstate (input) retrieved clock offset value (output), min clock offset (output) and max clock offset (output). - - .. seealso:: `nvmlDeviceGetClockOffsets` - """ - cdef ClockOffset_v1 info_py = ClockOffset_v1() - cdef nvmlClockOffset_t *info = (info_py._get_ptr()) - info.version = sizeof(nvmlClockOffset_v1_t) | (1 << 24) - with nogil: - __status__ = nvmlDeviceGetClockOffsets(device, info) - check_status(__status__) - return info_py - - cpdef device_set_clock_offsets(intptr_t device, intptr_t info): """Control current clock offset of some clock domain for a given PState. @@ -27173,7 +26829,7 @@ cpdef object device_get_topology_nearest_gpus(intptr_t device, unsigned int leve check_status_size(__status__) if count[0] == 0: return view.array(shape=(1,), itemsize=sizeof(intptr_t), format="P", mode="c")[:0] - cdef view.array deviceArray = view.array(shape=(deviceCount[0],), itemsize=sizeof(intptr_t), format="P", mode="c") + cdef view.array deviceArray = view.array(shape=(count[0],), itemsize=sizeof(intptr_t), format="P", mode="c") with nogil: __status__ = nvmlDeviceGetTopologyNearestGpus( device, @@ -27206,15 +26862,13 @@ cpdef object device_get_temperature_v(intptr_t device, nvmlTemperatureSensors_t return temperature.temperature -cpdef object device_get_supported_performance_states(intptr_t device, unsigned int size): +cpdef object device_get_supported_performance_states(intptr_t device): """Get all supported Performance States (P-States) for the device. Args: device (Device): The identifier of the target device. 
- size (unsigned int): The number of states to return. """ - if size == 0: - return view.array(shape=(1,), itemsize=sizeof(unsigned int), format="I", mode="c")[:0] + cdef int size = 16 # NVML_MAX_GPU_PERF_STATES cdef view.array pstates = view.array(shape=(size,), itemsize=sizeof(unsigned int), format="I", mode="c") # The header says "size is the size of the pstates array in bytes". @@ -28066,3 +27720,65 @@ cpdef object system_event_set_wait(intptr_t event_set, unsigned int timeout_ms, check_status(__status__) event_data._data.resize((request[0].numEvent,)) return event_data + + +cpdef unsigned int device_get_fan_speed_rpm(intptr_t device, unsigned int fan): + """Retrieves the intended operating speed in rotations per minute (RPM) of the device's specified fan. + + Args: + device (intptr_t): The identifier of the target device. + fan (unsigned int): The index of the fan to query. + + Returns: + rpm (unsigned int): The fan speed in RPM. + + .. seealso:: `nvmlDeviceGetFanSpeedRPM` + """ + cdef nvmlFanSpeedInfo_v1_t[1] fan_speed + fan_speed[0].version = sizeof(nvmlFanSpeedInfo_v1_t) | (1 << 24) + fan_speed[0].fan = fan + with nogil: + __status__ = nvmlDeviceGetFanSpeedRPM(device, fan_speed) + check_status(__status__) + return fan_speed[0].speed + + +cpdef int device_get_margin_temperature(intptr_t device): + """Retrieves the thermal margin temperature (distance to nearest slowdown threshold). + + Args: + device (intptr_t): The identifier of the target device. + + Returns: + margin_temperature (int): The margin temperature value. + + .. 
seealso:: `nvmlDeviceGetMarginTemperature` + """ + cdef nvmlMarginTemperature_v1_t[1] margin_temp_info + margin_temp_info[0].version = sizeof(nvmlMarginTemperature_v1_t) | (1 << 24) + with nogil: + __status__ = nvmlDeviceGetMarginTemperature(device, margin_temp_info) + check_status(__status__) + return margin_temp_info[0].marginTemperature + + +cpdef object device_get_clock_offsets(intptr_t device, nvmlClockType_t clock_type, nvmlPstates_t pstate): + """Retrieve min, max and current clock offset of some clock domain for a given PState. + + Args: + device (intptr_t): The identifier of the target device. + + Returns: + nvmlClockOffset_v1_t: Structure specifying the clock type (input) and the pstate (input) retrieved clock offset value (output), min clock offset (output) and max clock offset (output). + + .. seealso:: `nvmlDeviceGetClockOffsets` + """ + cdef ClockOffset_v1 info_py = ClockOffset_v1() + cdef nvmlClockOffset_v1_t *info = (info_py._get_ptr()) + info.version = sizeof(nvmlClockOffset_v1_t) | (1 << 24) + info.type = clock_type + info.pstate = pstate + with nogil: + __status__ = nvmlDeviceGetClockOffsets(device, info) + check_status(__status__) + return info_py diff --git a/cuda_core/cuda/core/system/_clock.pxi b/cuda_core/cuda/core/system/_clock.pxi new file mode 100644 index 0000000000..911ef4ce72 --- /dev/null +++ b/cuda_core/cuda/core/system/_clock.pxi @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class ClockOffsets: + """ + Contains clock offset information. + """ + + cdef object _clock_offset + + def __init__(self, clock_offset: nvml.ClockOffset): + self._clock_offset = clock_offset + + @property + def clock_offset_mhz(self) -> int: + """ + The current clock offset in MHz. + """ + return self._clock_offset.clock_offset_m_hz + + @property + def max_offset_mhz(self) -> int: + """ + The maximum clock offset in MHz. 
+ """ + return self._clock_offset.max_clock_offset_m_hz + + @property + def min_offset_mhz(self) -> int: + """ + The minimum clock offset in MHz. + """ + return self._clock_offset.min_clock_offset_m_hz + + +cdef class ClockInfo: + """ + Accesses various clock information about a device. + """ + + cdef intptr_t _handle + cdef int _clock_type + + def __init__(self, handle, clock_type: ClockType): + self._handle = handle + self._clock_type = int(clock_type) + + def get_current_mhz(self, clock_id: ClockId = ClockId.CURRENT) -> int: + """ + Get the current clock speed of a specific clock domain, in MHz. + + For Kepler™ or newer fully supported devices. + + Parameters + ---------- + clock_id: :class:`ClockId` + The clock ID to query. + + Returns + ------- + int + The clock speed in MHz. + """ + return nvml.device_get_clock(self._handle, self._clock_type, clock_id) + + def get_max_mhz(self) -> int: + """ + Get the maximum clock speed of a specific clock domain, in MHz. + + For Fermi™ or newer fully supported devices. + + Current P0 clocks (reported by :meth:`get_current_mhz`) can differ from + max clocks by a few MHz. + + Returns + ------- + int + The maximum clock speed in MHz. + """ + return nvml.device_get_max_clock_info(self._handle, self._clock_type) + + def get_max_customer_boost_mhz(self) -> int: + """ + Get the maximum customer boost clock speed of a specific clock, in MHz. + + For Pascal™ or newer fully supported devices. + + Returns + ------- + int + The maximum customer boost clock speed in MHz. + """ + return nvml.device_get_max_customer_boost_clock(self._handle, self._clock_type) + + def get_min_max_clock_of_pstate_mhz(self, pstate: Pstates) -> tuple[int, int]: + """ + Get the minimum and maximum clock speeds for this clock domain + at a given performance state (Pstate), in MHz. + + Parameters + ---------- + pstate: :class:`Pstates` + The performance state to query. 
+ + Returns + ------- + tuple[int, int] + A tuple containing the minimum and maximum clock speeds in MHz. + """ + return nvml.device_get_min_max_clock_of_p_state(self._handle, self._clock_type, pstate) + + def get_offsets(self, pstate: Pstates) -> ClockOffsets: + """ + Retrieve min, max and current clock offset of some clock domain for a given Pstate. + + For Maxwell™ or newer fully supported devices. + + Parameters + ---------- + pstate: :class:`Pstates` + The performance state to query. + + Returns + ------- + ClockOffsets + An object with the min, max and current clock offset. + """ + return ClockOffsets(nvml.device_get_clock_offsets(self._handle, self._clock_type, pstate)) diff --git a/cuda_core/cuda/core/system/_cooler.pxi b/cuda_core/cuda/core/system/_cooler.pxi new file mode 100644 index 0000000000..4d49f7ae9e --- /dev/null +++ b/cuda_core/cuda/core/system/_cooler.pxi @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class CoolerInfo: + cdef object _cooler_info + + def __init__(self, cooler_info: nvml.CoolerInfo): + self._cooler_info = cooler_info + + @property + def signal_type(self) -> CoolerControl: + """ + The cooler's control signal characteristics. + + The possible types are restricted, variable and toggle. See + :class:`CoolerControl` for details. + """ + return CoolerControl(self._cooler_info.signal_type) + + @property + def target(self) -> list[CoolerTarget]: + """ + The target that the cooler controls. + + Targets may be GPU, Memory, Power Supply, or all of these. See + :class:`CoolerTarget` for details. 
+ """ + cdef uint64_t[1] targets = [self._cooler_info.target] + return [CoolerTarget(1 << ev) for ev in _unpack_bitmask(targets)] diff --git a/cuda_core/cuda/core/system/_device.pyx b/cuda_core/cuda/core/system/_device.pyx index b013ef79ca..71cb35b907 100644 --- a/cuda_core/cuda/core/system/_device.pyx +++ b/cuda_core/cuda/core/system/_device.pyx @@ -12,19 +12,37 @@ from cuda.bindings import _nvml as nvml from ._nvml_context cimport initialize -include "_device_utils.pxi" -include "_inforom.pxi" - AddressingMode = nvml.DeviceAddressingModeType +AffinityScope = nvml.AffinityScope BrandType = nvml.BrandType +ClockId = nvml.ClockId +ClocksEventReasons = nvml.ClocksEventReasons +ClockType = nvml.ClockType +CoolerControl = nvml.CoolerControl +CoolerTarget = nvml.CoolerTarget EventType = nvml.EventType +FanControlPolicy = nvml.FanControlPolicy FieldId = nvml.FieldId GpuP2PCapsIndex = nvml.GpuP2PCapsIndex GpuP2PStatus = nvml.GpuP2PStatus GpuTopologyLevel = nvml.GpuTopologyLevel InforomObject = nvml.InforomObject PcieUtilCounter = nvml.PcieUtilCounter +Pstates = nvml.Pstates +TemperatureSensors = nvml.TemperatureSensors +TemperatureThresholds = nvml.TemperatureThresholds +ThermalController = nvml.ThermalController +ThermalTarget = nvml.ThermalTarget + + +include "_clock.pxi" +include "_cooler.pxi" +include "_device_utils.pxi" +include "_fan.pxi" +include "_inforom.pxi" +include "_performance.pxi" +include "_temperature.pxi" class DeviceArchitecture: @@ -752,6 +770,187 @@ cdef class Device: device._handle = handle yield device + def get_memory_affinity(self, scope: AffinityScope=AffinityScope.NODE) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + memory affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the memory affinity for the immediate non-I/O + ancestor of the device. 
+ """ + return _unpack_bitmask( + nvml.device_get_memory_affinity( + self._handle, + ceil(cpu_count() / 64), + scope + ) + ) + + def get_cpu_affinity(self, scope: AffinityScope=AffinityScope.NODE) -> list[int]: + """ + Retrieves a list of indices of NUMA nodes or CPU sockets with the ideal + CPU affinity for the device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + + If requested scope is not applicable to the target topology, the API + will fall back to reporting the CPU affinity for the immediate non-I/O + ancestor of the device. + """ + return _unpack_bitmask( + nvml.device_get_cpu_affinity_within_scope( + self._handle, + ceil(cpu_count() / 64), + scope, + ) + ) + + def set_cpu_affinity(self): + """ + Sets the ideal affinity for the calling thread and device. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + nvml.device_set_cpu_affinity(self._handle) + + def clear_cpu_affinity(self): + """ + Clear all affinity bindings for the calling thread. + + For Kepler™ or newer fully supported devices. + + Supported on Linux only. + """ + nvml.device_clear_cpu_affinity(self._handle) + + @property + def numa_node_id(self) -> int: + """ + The NUMA node of the given GPU device. + + This only applies to platforms where the GPUs are NUMA nodes. + """ + return nvml.device_get_numa_node_id(self._handle) + + def clock(self, clock_type: ClockType) -> ClockInfo: + """ + Get information about and manage a specific clock on a device. + """ + return ClockInfo(self._handle, clock_type) + + def get_auto_boosted_clocks_enabled(self) -> tuple[bool, bool]: + """ + Retrieve the current state of auto boosted clocks on a device. + + For Kepler™ or newer fully supported devices. + + Auto Boosted clocks are enabled by default on some hardware, allowing + the GPU to run at higher clock rates to maximize performance as thermal + limits allow. 
+ + On Pascal™ and newer hardware, Auto Boosted clocks are controlled + through application clocks. Use :meth:`set_application_clocks` and + :meth:`reset_application_clocks` to control Auto Boost behavior. + + Returns + ------- + bool + The current state of Auto Boosted clocks + bool + The default Auto Boosted clocks behavior + + """ + current, default = nvml.device_get_auto_boosted_clocks_enabled(self._handle) + return current == nvml.EnableState.FEATURE_ENABLED, default == nvml.EnableState.FEATURE_ENABLED + + def get_current_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves the current clocks event reasons. + + For all fully supported products. + """ + cdef uint64_t[1] reasons + reasons[0] = nvml.device_get_current_clocks_event_reasons(self._handle) + return [ClocksEventReasons(1 << reason) for reason in _unpack_bitmask(reasons)] + + def get_supported_clock_event_reasons(self) -> list[ClocksEventReasons]: + """ + Retrieves supported clocks event reasons that can be returned by + :meth:`get_current_clock_event_reasons`. + + For all fully supported products. + + This method is not supported in virtual machines running virtual GPU (vGPU). + """ + cdef uint64_t[1] reasons + reasons[0] = nvml.device_get_supported_clocks_event_reasons(self._handle) + return [ClocksEventReasons(1 << reason) for reason in _unpack_bitmask(reasons)] + + def fan(self, fan: int = 0) -> FanInfo: + """ + Get information and manage a specific fan on a device. + """ + if fan < 0 or fan >= self.num_fans: + raise ValueError(f"Fan index {fan} is out of range [0, {self.num_fans})") + return FanInfo(self._handle, fan) + + @property + def num_fans(self) -> int: + """ + The number of fans on the device. + """ + return nvml.device_get_num_fans(self._handle) + + @property + def cooler(self) -> CoolerInfo: + """ + Get information about cooler on a device. 
+ """ + return CoolerInfo(nvml.device_get_cooler_info(self._handle)) + + @property + def temperature(self) -> Temperature: + """ + Get information about temperatures on a device. + """ + return Temperature(self._handle) + + @property + def performance_state(self) -> Pstates: + """ + The current performance state of the device. + + For Fermi™ or newer fully supported devices. + + See :class:`Pstates` for possible performance states. + """ + return Pstates(nvml.device_get_performance_state(self._handle)) + + @property + def dynamic_pstates_info(self) -> GpuDynamicPstatesInfo: + """ + Retrieve performance monitor samples from the associated subdevice. + """ + return GpuDynamicPstatesInfo(nvml.device_get_dynamic_pstates_info(self._handle)) + + def get_supported_pstates(self) -> list[Pstates]: + """ + Get all supported Performance States (P-States) for the device. + + The returned list contains a contiguous list of valid P-States supported by + the device. + """ + return [Pstates(x) for x in nvml.device_get_supported_performance_states(self._handle)] + @property def architecture(self) -> DeviceArchitecture: """ @@ -775,22 +974,6 @@ cdef class Device: """ return BAR1MemoryInfo(nvml.device_get_bar1_memory_info(self._handle)) - @property - def cpu_affinity(self) -> list[int]: - """ - Get a list containing the CPU indices to which the GPU is directly connected. - - Examples - -------- - >>> Device(index=0).cpu_affinity - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59] - """ - return _unpack_bitmask(nvml.device_get_cpu_affinity( - self._handle, - ceil(cpu_count() / 64), - )) - @property def cuda_compute_capability(self) -> tuple[int, int]: """ @@ -1038,6 +1221,130 @@ cdef class Device: device._handle = handle yield device + @property + def index(self) -> int: + """ + The NVML index of this device. 
+ + Valid indices are derived from the count returned by + :meth:`Device.get_device_count`. For example, if ``get_device_count()`` + returns 2, the valid indices are 0 and 1, corresponding to GPU 0 and GPU + 1. + + The order in which NVML enumerates devices has no guarantees of + consistency between reboots. For that reason, it is recommended that + devices be looked up by their PCI ids or GPU UUID. + + Note: The NVML index may not correlate with other APIs, such as the CUDA + device index. + """ + return nvml.device_get_index(self._handle) + + @property + def module_id(self) -> int: + """ + Get a unique identifier for the device module on the baseboard. + + This API retrieves a unique identifier for each GPU module that exists + on a given baseboard. For non-baseboard products, this ID would always + be 0. + """ + return nvml.device_get_module_id(self._handle) + + @property + def minor_number(self) -> int: + """ + The minor number of this device. + + For Linux only. + + The minor number is used by the Linux device driver to identify the + device node in ``/dev/nvidiaX``. + """ + return nvml.device_get_minor_number(self._handle) + + @property + def addressing_mode(self) -> AddressingMode: + """ + Get the addressing mode of the device. + + Addressing modes can be one of: + + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_HMM`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + software-based mirroring of the CPU's page tables, on the GPU. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_ATS`: System allocated + memory (``malloc``, ``mmap``) is addressable from the device (GPU), via + Address Translation Services. This means that there is (effectively) a + single set of page tables, and the CPU and GPU both use them. + - :attr:`AddressingMode.DEVICE_ADDRESSING_MODE_NONE`: Neither HMM nor ATS + is active. 
+ """ + return AddressingMode(nvml.device_get_addressing_mode(self._handle).value) + + @property + def display_mode(self) -> bool: + """ + The display mode for this device. + + Indicates whether a physical display (e.g. monitor) is currently connected to + any of the device's connectors. + """ + return nvml.device_get_display_mode(self._handle) == nvml.EnableState.FEATURE_ENABLED + + @property + def display_active(self) -> bool: + """ + The display active status for this device. + + Indicates whether a display is initialized on the device. For example, + whether X Server is attached to this device and has allocated memory for + the screen. + + Display can be active even when no monitor is physically attached. + """ + return nvml.device_get_display_active(self._handle) == nvml.EnableState.FEATURE_ENABLED + + @property + def repair_status(self) -> RepairStatus: + """ + Get the repair status for TPC/Channel repair. + + For Ampere™ or newer fully supported devices. + """ + return RepairStatus(self._handle) + + @property + def inforom(self) -> InforomInfo: + """ + Accessor for InfoROM information. + + For all products with an InfoROM. + """ + return InforomInfo(self) + + def get_topology_nearest_gpus(self, level: GpuTopologyLevel) -> Iterable[Device]: + """ + Retrieve the GPUs that are nearest to this device at a specific interconnectivity level. + + Supported on Linux only. + + Parameters + ---------- + level: :class:`GpuTopologyLevel` + The topology level. + + Returns + ------- + Iterable of :class:`Device` + The nearest devices at the given topology level.
+ """ + cdef Device device + for handle in nvml.device_get_topology_nearest_gpus(self._handle, level): + device = Device.__new__(Device) + device._handle = handle + yield device + @property def attributes(self) -> DeviceAttributes: """ @@ -1167,17 +1474,32 @@ def get_p2p_status(device1: Device, device2: Device, index: GpuP2PCapsIndex) -> __all__ = [ "AddressingMode", + "AffinityScope", "BAR1MemoryInfo", "BrandType", + "ClockId", + "ClockInfo", + "ClockOffsets", + "ClocksEventReasons", + "ClockType", + "CoolerControl", + "CoolerInfo", + "CoolerTarget", "Device", "DeviceArchitecture", "DeviceAttributes", "DeviceEvents", "EventData", "EventType", + "FanControlPolicy", + "FanInfo", "FieldId", "FieldValue", "FieldValues", + "get_p2p_status", + "get_topology_common_ancestor", + "GpuDynamicPstatesInfo", + "GpuDynamicPstatesUtilization", "GpuP2PCapsIndex", "GpuP2PStatus", "GpuTopologyLevel", @@ -1186,7 +1508,13 @@ __all__ = [ "MemoryInfo", "PcieUtilCounter", "PciInfo", + "Pstates", "RepairStatus", - "get_p2p_status", - "get_topology_common_ancestor", + "Temperature", + "TemperatureSensors", + "TemperatureThresholds", + "ThermalController", + "ThermalSensor", + "ThermalSettings", + "ThermalTarget", ] diff --git a/cuda_core/cuda/core/system/_fan.pxi b/cuda_core/cuda/core/system/_fan.pxi new file mode 100644 index 0000000000..18525a21b1 --- /dev/null +++ b/cuda_core/cuda/core/system/_fan.pxi @@ -0,0 +1,103 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class FanInfo: + """ + Manages information related to a specific fan on a specific device. + """ + + cdef intptr_t _handle + cdef int _fan + + def __init__(self, handle: int, fan: int): + self._handle = handle + self._fan = fan + + @property + def speed(self) -> int: + """ + Get/set the intended operating speed of the device's fan. + + For all discrete products with dedicated fans. 
+ + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + return nvml.device_get_fan_speed_v2(self._handle, self._fan) + + @speed.setter + def speed(self, speed: int): + nvml.device_set_fan_speed_v2(self._handle, self._fan, speed) + + @property + def speed_rpm(self) -> int: + """ + The intended operating speed of the device's fan in rotations per minute + (RPM). + + For Maxwell™ or newer fully supported devices. + + For all discrete products with dedicated fans. + + Note: The reported speed is the intended fan speed. If the fan is + physically blocked and unable to spin, the output will not match the + actual fan speed. + """ + return nvml.device_get_fan_speed_rpm(self._handle, self._fan) + + @property + def target_speed(self) -> int: + """ + Retrieves the intended target speed of the device's specified fan. + + For all discrete products with dedicated fans. + + Normally, the driver dynamically adjusts the fan based on + the needs of the GPU. But when the user sets the fan speed using :attr:`speed`, + the driver will attempt to make the fan achieve the setting in + :attr:`speed`. The actual current speed of the fan + is reported in :attr:`speed`. + + The fan speed is expressed as a percentage of the product's maximum + noise tolerance fan speed. This value may exceed 100% in certain cases. + """ + return nvml.device_get_target_fan_speed(self._handle, self._fan) + + @property + def min_max_speed(self) -> tuple[int, int]: + """ + Retrieves the minimum and maximum fan speed for all of the device's fans. + + For all discrete products with dedicated fans.
+ + Returns + ------- + tuple[int, int] + A tuple of (min_speed, max_speed) + """ + return nvml.device_get_min_max_fan_speed(self._handle) + + @property + def control_policy(self) -> FanControlPolicy: + """ + The current fan control policy. + + For Maxwell™ or newer fully supported devices. + + For all CUDA-capable discrete products with fans. + """ + return FanControlPolicy(nvml.device_get_fan_control_policy_v2(self._handle, self._fan)) + + def set_default_fan_speed(self): + """ + Set the speed of the fan control policy to default. + + For all CUDA-capable discrete products with fans. + """ + nvml.device_set_default_fan_speed_v2(self._handle, self._fan) diff --git a/cuda_core/cuda/core/system/_performance.pxi b/cuda_core/cuda/core/system/_performance.pxi new file mode 100644 index 0000000000..6ba1d40f9c --- /dev/null +++ b/cuda_core/cuda/core/system/_performance.pxi @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +# In cuda.bindings.nvml, this is an anonymous struct inside nvmlGpuDynamicPstatesInfo_t. + + +ctypedef struct _GpuDynamicPstatesUtilization: + unsigned int bIsPresent + unsigned int percentage + unsigned int incThreshold + unsigned int decThreshold + + +cdef class GpuDynamicPstatesUtilization: + cdef: + _GpuDynamicPstatesUtilization *_ptr + object _owner + + def __init__(self, ptr: int, owner: object): + # ptr points to a part of the numpy buffer held by `_owner`, so we need + # to maintain a reference to `_owner` to keep it alive. + self._ptr = <_GpuDynamicPstatesUtilization *>ptr + self._owner = owner + + @property + def is_present(self) -> bool: + """ + Set if the utilization domain is present on this GPU. + """ + return bool(self._ptr[0].bIsPresent) + + @property + def percentage(self) -> int: + """ + Percentage of time where the domain is considered busy in the last 1-second interval. 
+ """ + return self._ptr[0].percentage + + @property + def inc_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-increasing P-State change when crossed. + """ + return self._ptr[0].incThreshold + + @property + def dec_threshold(self) -> int: + """ + Utilization threshold that can trigger a perf-decreasing P-State change when crossed. + """ + return self._ptr[0].decThreshold + + +cdef class GpuDynamicPstatesInfo: + """ + Handles performance monitor samples from the device. + """ + cdef object _gpu_dynamic_pstates_info + + def __init__(self, gpu_dynamic_pstates_info: nvml.GpuDynamicPstatesInfo): + self._gpu_dynamic_pstates_info = gpu_dynamic_pstates_info + + def __len__(self): + return nvml.MAX_GPU_UTILIZATIONS + + def __getitem__(self, idx: int) -> GpuDynamicPstatesUtilization: + if idx < 0 or idx >= nvml.MAX_GPU_UTILIZATIONS: + raise IndexError("GPU dynamic P-states index out of range") + return GpuDynamicPstatesUtilization( + self._gpu_dynamic_pstates_info.utilization.ptr + idx * sizeof(_GpuDynamicPstatesUtilization), + self._gpu_dynamic_pstates_info + ) diff --git a/cuda_core/cuda/core/system/_system.pyx b/cuda_core/cuda/core/system/_system.pyx index c29d20dd20..3e15420dc8 100644 --- a/cuda_core/cuda/core/system/_system.pyx +++ b/cuda_core/cuda/core/system/_system.pyx @@ -22,7 +22,7 @@ if CUDA_BINDINGS_NVML_IS_COMPATIBLE: from cuda.bindings import _nvml as nvml # TODO: We need to be even more specific than version numbers for development. # This can be removed once we have a release including everything we need. 
- for member in ["FieldId"]: + for member in ["FieldId", "ClocksEventReasons"]: if not hasattr(nvml, member): CUDA_BINDINGS_NVML_IS_COMPATIBLE = False break diff --git a/cuda_core/cuda/core/system/_temperature.pxi b/cuda_core/cuda/core/system/_temperature.pxi new file mode 100644 index 0000000000..20e5f6f99e --- /dev/null +++ b/cuda_core/cuda/core/system/_temperature.pxi @@ -0,0 +1,140 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +# In cuda.bindings.nvml, this is an anonymous struct inside nvmlThermalSettings_t. + + +ctypedef struct _ThermalSensor: + int controller + int defaultMinTemp + int defaultMaxTemp + int currentTemp + int target + + +cdef class ThermalSensor: + cdef: + _ThermalSensor *_ptr + object _owner + + def __init__(self, ptr: int, owner: object): + # ptr points to a part of the numpy buffer held by `_owner`, so we need + # to maintain a reference to `_owner` to keep it alive. 
+ self._ptr = <_ThermalSensor *>ptr + self._owner = owner + + @property + def controller(self) -> ThermalController: + return ThermalController(self._ptr[0].controller) + + @property + def default_min_temp(self) -> int: + return self._ptr[0].defaultMinTemp + + @property + def default_max_temp(self) -> int: + return self._ptr[0].defaultMaxTemp + + @property + def current_temp(self) -> int: + return self._ptr[0].currentTemp + + @property + def target(self) -> ThermalTarget: + return ThermalTarget(self._ptr[0].target) + + +cdef class ThermalSettings: + cdef object _thermal_settings + + def __init__(self, thermal_settings: nvml.ThermalSettings): + self._thermal_settings = thermal_settings + + def __len__(self): + # MAX_THERMAL_SENSORS_PER_GPU is 3 + return min(self._thermal_settings.count, 3) + + def __getitem__(self, idx: int) -> ThermalSensor: + if idx < 0 or idx >= len(self): + raise IndexError("Thermal sensor index out of range") + return ThermalSensor( + self._thermal_settings.sensor.ptr + idx * sizeof(_ThermalSensor), + self._thermal_settings + ) + + +cdef class Temperature: + cdef intptr_t _handle + + def __init__(self, handle: int): + self._handle = handle + + def sensor( + self, + sensor: TemperatureSensors = TemperatureSensors.TEMPERATURE_GPU + ) -> int: + """ + Get the temperature reading from a specific sensor on the device, in + degrees Celsius. + + Parameters + ---------- + sensor: :class:`TemperatureSensors`, optional + The temperature sensor to query. + + Returns + ------- + int + The temperature in degrees Celsius. + """ + return nvml.device_get_temperature_v(self._handle, sensor) + + def threshold(self, threshold_type: TemperatureThresholds) -> int: + """ + Retrieves the temperature threshold for this GPU with the specified + threshold type, in degrees Celsius. + + For Kepler™ or newer fully supported devices. + + See :class:`TemperatureThresholds` for possible threshold types.
+ + Note: This API is no longer the preferred interface for retrieving the + following temperature thresholds on Ada and later architectures: + ``NVML_TEMPERATURE_THRESHOLD_SHUTDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_SLOWDOWN``, + ``NVML_TEMPERATURE_THRESHOLD_MEM_MAX`` and + ``NVML_TEMPERATURE_THRESHOLD_GPU_MAX``. + + Support for reading these temperature thresholds for Ada and later + architectures would be removed from this API in future releases. Please + use :meth:`get_field_values` with ``NVML_FI_DEV_TEMPERATURE_*`` fields + to retrieve temperature thresholds on these architectures. + """ + return nvml.device_get_temperature_threshold(self._handle, threshold_type) + + @property + def margin(self) -> int: + """ + The thermal margin temperature (distance to nearest slowdown threshold) for the device. + """ + return nvml.device_get_margin_temperature(self._handle) + + def thermal_settings(self, sensor_index: ThermalTarget) -> ThermalSettings: + """ + Used to execute a list of thermal system instructions. + + TODO: The above docstring is from the NVML header, but it doesn't seem to make sense. + + Parameters + ---------- + sensor_index: ThermalTarget + The index of the thermal sensor. + + Returns + ------- + :class:`ThermalSettings` + The thermal settings for the specified sensor. 
+ """ + return ThermalSettings(nvml.device_get_thermal_settings(self._handle, sensor_index)) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 1c10bb7298..9772b78786 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -92,16 +92,29 @@ CUDA system information and NVIDIA Management Library (NVML) system.Device system.AddressingMode + system.AffinityScope system.BAR1MemoryInfo system.BrandType + system.ClockId + system.ClockInfo + system.ClockOffsets + system.ClocksEventReasons + system.ClockType + system.CoolerControl + system.CoolerInfo + system.CoolerTarget system.DeviceArchitecture system.DeviceAttributes system.DeviceEvents system.EventData system.EventType + system.FanControlPolicy + system.FanInfo system.FieldId system.FieldValue system.FieldValues + system.GpuDynamicPstatesInfo + system.GpuDynamicPstatesUtilization system.GpuP2PCapsIndex system.GpuP2PStatus system.GpuTopologyLevel @@ -110,7 +123,15 @@ CUDA system information and NVIDIA Management Library (NVML) system.MemoryInfo system.PcieUtilCounter system.PciInfo + system.Pstates system.RepairStatus + system.Temperature + system.TemperatureSensors + system.TemperatureThresholds + system.ThermalController + system.ThermalSensor + system.ThermalSettings + system.ThermalTarget .. 
module:: cuda.core.utils diff --git a/cuda_core/tests/system/test_system_device.py b/cuda_core/tests/system/test_system_device.py index 8f07b2ee27..2e762ce860 100644 --- a/cuda_core/tests/system/test_system_device.py +++ b/cuda_core/tests/system/test_system_device.py @@ -64,11 +64,12 @@ def test_device_bar1_memory(): assert free + used == total +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") def test_device_cpu_affinity(): skip_reasons = set() for device in system.Device.get_all_devices(): try: - affinity = device.cpu_affinity + affinity = device.get_cpu_affinity(system.AffinityScope.NODE) except system.NotSupportedError: skip_reasons.add(f"CPU affinity not supported on '{device.name}'") else: @@ -79,6 +80,42 @@ def test_device_cpu_affinity(): pytest.skip(" ; ".join(skip_reasons)) +@pytest.mark.skipif(helpers.IS_WSL or helpers.IS_WINDOWS, reason="Device attributes not supported on WSL or Windows") +def test_affinity(): + skip_reasons = set() + for device in system.Device.get_all_devices(): + for scope in (system.AffinityScope.NODE, system.AffinityScope.SOCKET): + try: + affinity = device.get_cpu_affinity(scope) + except system.NotSupportedError: + skip_reasons.add(f"CPU affinity not supported on '{device.name}'") + else: + assert isinstance(affinity, list) + + try: + affinity = device.get_memory_affinity(scope) + except system.NotSupportedError: + skip_reasons.add(f"Memory affinity not supported on '{device.name}'") + else: + assert isinstance(affinity, list) + if skip_reasons: + pytest.skip(" ; ".join(skip_reasons)) + + +def test_numa_node_id(): + skip_reasons = set() + for device in system.Device.get_all_devices(): + try: + numa_node_id = device.numa_node_id + except system.NotSupportedError: + skip_reasons.add(f"NUMA node ID not supported by device '{device.name}'") + else: + assert isinstance(numa_node_id, int) + assert numa_node_id >= -1 + if skip_reasons: + pytest.skip(" ; 
".join(skip_reasons)) + + def test_device_cuda_compute_capability(): for device in system.Device.get_all_devices(): cuda_compute_capability = device.cuda_compute_capability @@ -390,7 +427,7 @@ def test_get_all_devices_with_cpu_affinity(): try: for i in range(multiprocessing.cpu_count()): for device in system.Device.get_all_devices_with_cpu_affinity(i): - affinity = device.cpu_affinity + affinity = device.get_cpu_affinity() assert isinstance(affinity, list) assert i in affinity except system.NotSupportedError: @@ -522,3 +559,179 @@ def test_get_inforom_version(): assert len(board_part_number) > 0 inforom.validate() + + +def test_clock(): + for device in system.Device.get_all_devices(): + try: + current, default = device.get_auto_boosted_clocks_enabled() + except system.NotSupportedError: + pass + else: + assert isinstance(current, bool) + assert isinstance(default, bool) + + for clock_type in system.ClockType: + clock = device.clock(clock_type) + assert isinstance(clock, system.ClockInfo) + + try: + current_mhz = clock.get_current_mhz() + except system.NotSupportedError: + continue + assert isinstance(current_mhz, int) + assert current_mhz >= 0 + + current_mhz = clock.get_current_mhz(system.ClockId.CURRENT) + assert isinstance(current_mhz, int) + assert current_mhz >= 0 + + max_mhz = clock.get_max_mhz() + assert isinstance(max_mhz, int) + assert max_mhz >= 0 + + try: + max_customer_boost = clock.get_max_customer_boost_mhz() + except system.NotSupportedError: + pass + else: + assert isinstance(max_customer_boost, int) + assert max_customer_boost >= 0 + + pstate = device.performance_state + + min_, max_ = clock.get_min_max_clock_of_pstate_mhz(pstate) + assert isinstance(min_, int) + assert min_ >= 0 + assert isinstance(max_, int) + assert max_ >= 0 + + try: + offsets = clock.get_offsets(pstate) + except system.InvalidArgumentError: + offsets = system.ClockOffsets(nvml.ClockOffset_v1()) + assert isinstance(offsets, system.ClockOffsets) + assert 
isinstance(offsets.clock_offset_mhz, int) + assert isinstance(offsets.max_offset_mhz, int) + assert isinstance(offsets.min_offset_mhz, int) + + +def test_clock_event_reasons(): + for device in system.Device.get_all_devices(): + reasons = device.get_current_clock_event_reasons() + assert all(isinstance(reason, system.ClocksEventReasons) for reason in reasons) + + reasons = device.get_supported_clock_event_reasons() + assert all(isinstance(reason, system.ClocksEventReasons) for reason in reasons) + + +def test_fan(): + for device in system.Device.get_all_devices(): + for fan_idx in range(device.num_fans): + fan_info = device.fan(fan_idx) + assert isinstance(fan_info, system.FanInfo) + + try: + speed = fan_info.speed + assert isinstance(speed, int) + assert 0 <= speed <= 200 + + fan_info.speed = 50 + fan_info.speed = speed + + speed_rpm = fan_info.speed_rpm + assert isinstance(speed_rpm, int) + assert speed_rpm >= 0 + + target_speed = fan_info.target_speed + assert isinstance(target_speed, int) + assert speed <= target_speed * 2 + + min_, max_ = fan_info.min_max_speed + assert isinstance(min_, int) + assert isinstance(max_, int) + assert min_ <= max_ + if speed > 0: + assert min_ <= speed <= max_ + + control_policy = fan_info.control_policy + assert isinstance(control_policy, system.FanControlPolicy) + finally: + fan_info.set_default_fan_speed() + + +def test_cooler(): + for device in system.Device.get_all_devices(): + try: + cooler_info = device.cooler + except system.NotSupportedError: + pytest.skip("CoolerInfo not supported on this device") + + assert isinstance(cooler_info, system.CoolerInfo) + + signal_type = cooler_info.signal_type + assert isinstance(signal_type, system.CoolerControl) + + target = cooler_info.target + assert all(isinstance(t, system.CoolerTarget) for t in target) + + +def test_temperature(): + for device in system.Device.get_all_devices(): + temperature = device.temperature + assert isinstance(temperature, system.Temperature) + + sensor = 
temperature.sensor() + assert isinstance(sensor, int) + assert sensor >= 0 + + for threshold in list(system.TemperatureThresholds)[:-1]: + try: + t = temperature.threshold(threshold) + except system.NotSupportedError: + continue + else: + assert isinstance(t, int) + assert t >= 0 + + try: + margin = temperature.margin + except system.NotSupportedError: + pass + else: + assert isinstance(margin, int) + assert margin >= 0 + + thermals = temperature.thermal_settings(system.ThermalTarget.ALL) + assert isinstance(thermals, system.ThermalSettings) + + for sensor in thermals: + assert isinstance(sensor, system.ThermalSensor) + assert isinstance(sensor.target, system.ThermalTarget) + assert isinstance(sensor.controller, system.ThermalController) + assert isinstance(sensor.default_min_temp, int) + assert sensor.default_min_temp >= 0 + assert isinstance(sensor.default_max_temp, int) + assert sensor.default_max_temp >= sensor.default_min_temp + assert isinstance(sensor.current_temp, int) + assert sensor.default_min_temp <= sensor.current_temp <= sensor.default_max_temp + + +def test_pstates(): + for device in system.Device.get_all_devices(): + pstate = device.performance_state + assert isinstance(pstate, system.Pstates) + + pstates = device.get_supported_pstates() + assert all(isinstance(p, system.Pstates) for p in pstates) + + dynamic_pstates_info = device.dynamic_pstates_info + assert isinstance(dynamic_pstates_info, system.GpuDynamicPstatesInfo) + + assert len(dynamic_pstates_info) == nvml.MAX_GPU_UTILIZATIONS + + for utilization in dynamic_pstates_info: + assert isinstance(utilization.is_present, bool) + assert isinstance(utilization.percentage, int) + assert isinstance(utilization.inc_threshold, int) + assert isinstance(utilization.dec_threshold, int)