Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1337,6 +1337,77 @@ FileDescriptorHandle create_fd_handle_ref(int fd) {
#endif
}

// ============================================================================
// NVML event set function pointers and registration
// ============================================================================

// Free-function pointers for the device-scope and system-scope NVML event
// sets. Both start out null; register_nvml_event_set_fn_pointers() assigns them.
NvmlEventSetFreeFn p_nvmlEventSetFree{nullptr};
NvmlSysEventSetFreeFn p_nvmlSysEventSetFree{nullptr};

// Store the NVML free-function addresses that the caller passes as integers.
// Every call overwrites both global pointers with the supplied values.
void register_nvml_event_set_fn_pointers(intptr_t event_set_free_fn,
                                         intptr_t sys_event_set_free_fn) noexcept {
    const auto device_free = reinterpret_cast<NvmlEventSetFreeFn>(event_set_free_fn);
    const auto system_free = reinterpret_cast<NvmlSysEventSetFreeFn>(sys_event_set_free_fn);

    p_nvmlEventSetFree = device_free;
    p_nvmlSysEventSetFree = system_free;
}

// ============================================================================
// NVML Event Set Handles (device-scope)
// ============================================================================

namespace {
// Heap-allocated holder for a device-scope NVML event set value.
// create_nvml_event_set_handle() allocates one per handle, hands out a pointer
// to `resource`, and lets the shared_ptr deleter destroy the box.
// Kept as a plain aggregate: it is built with `NvmlEventSetBox{{handle}}`.
struct NvmlEventSetBox {
NvmlEventSetValue resource;
};
} // namespace

// Wrap a raw device-scope NVML event set in an owning shared handle.
//
// `handle` is the event set value as an integer. When no free function has
// been registered, an empty handle is returned and `handle` is left untouched,
// so the caller still owns it. Otherwise the deleter frees a non-zero `handle`
// through p_nvmlEventSetFree once the last owner goes away.
NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle) {
    if (p_nvmlEventSetFree == nullptr) {
        return NvmlEventSetHandle{};
    }

    // Deleter: release the NVML event set (if there is one), then the box.
    const auto release = [](NvmlEventSetBox* owned) {
        const auto raw = owned->resource.raw;
        // The global pointer is read again here, at release time.
        if (p_nvmlEventSetFree != nullptr && raw != 0) {
            p_nvmlEventSetFree(reinterpret_cast<void*>(raw));
        }
        delete owned;
    };

    std::shared_ptr<NvmlEventSetBox> owner(new NvmlEventSetBox{{handle}}, release);

    // Aliasing constructor: shares ownership of the box but exposes only
    // the contained resource value.
    return NvmlEventSetHandle(owner, &owner->resource);
}

// ============================================================================
// NVML System Event Set Handles (system-scope)
// ============================================================================

namespace {
// Heap-allocated holder for a system-scope NVML event set value.
// create_nvml_sys_event_set_handle() allocates one per handle, hands out a
// pointer to `resource`, and lets the shared_ptr deleter destroy the box.
// Kept as a plain aggregate: it is built with `NvmlSysEventSetBox{{handle}}`.
struct NvmlSysEventSetBox {
NvmlSysEventSetValue resource;
};
} // namespace

// Wrap a raw system-scope NVML event set in an owning shared handle.
//
// `handle` is the event set value as an integer. When no free function has
// been registered, an empty handle is returned and `handle` is left untouched,
// so the caller still owns it. Otherwise the deleter frees a non-zero `handle`
// through p_nvmlSysEventSetFree once the last owner goes away.
NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle) {
    if (!p_nvmlSysEventSetFree) {
        return NvmlSysEventSetHandle{};
    }
    auto box = std::shared_ptr<NvmlSysEventSetBox>(
        new NvmlSysEventSetBox{{handle}},
        [](NvmlSysEventSetBox* b) {
            if (p_nvmlSysEventSetFree && b->resource.raw) {
                // Matches NVML_STRUCT_VERSION(SystemEventSetFreeRequest, 1):
                // version = sizeof(struct) | (1 << 24). Both our struct and the
                // NVML header struct have the same layout ({unsigned int, void*}).
                // Value-initialise so no member is indeterminate when the
                // request is handed to the C API.
                NvmlSysEventSetFreeRequest req{};
                req.version = static_cast<unsigned int>(
                    sizeof(NvmlSysEventSetFreeRequest) | (1u << 24u));
                req.set = reinterpret_cast<void*>(b->resource.raw);
                p_nvmlSysEventSetFree(&req);
            }
            delete b;
        }
    );
    // Aliasing constructor: shares ownership of the box but exposes only
    // the contained resource value.
    return NvmlSysEventSetHandle(box, &box->resource);
}

// ============================================================================
// SM resource split wrapper
// ============================================================================
Expand Down
62 changes: 62 additions & 0 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,13 @@ struct TaggedHandle {
using NvvmProgramValue = TaggedHandle<nvvmProgram, 0>;
using NvJitLinkValue = TaggedHandle<nvJitLink_t, 1>;

// NVML event set types — stored as intptr_t to avoid an nvml.h dependency.
// nvmlEventSet_t = nvmlEventSet_st* (device-scope event set)
// nvmlSystemEventSet_t = nvmlSystemEventSet_st* (system-scope event set)
// The distinct tags make the two intptr_t-based values different C++ types,
// so functions can be overloaded on them.
using NvmlEventSetValue = TaggedHandle<intptr_t, 2>;
using NvmlSysEventSetValue = TaggedHandle<intptr_t, 3>;

// ============================================================================
// Thread-local error handling
// ============================================================================
Expand Down Expand Up @@ -152,6 +159,35 @@ extern NvvmDestroyProgramFn p_nvvmDestroyProgram;
using NvJitLinkDestroyFn = int (*)(nvJitLink_t*);
extern NvJitLinkDestroyFn p_nvJitLinkDestroy;

// ============================================================================
// NVML event set function pointers
//
// Populated by register_nvml_event_set_fn_pointers(), called from the system
// event / device modules once the NVML bindings have loaded the library.
// Both may be null until registration; deleters are no-ops when null.
// ============================================================================

// nvmlReturn_t nvmlEventSetFree(nvmlEventSet_t set)
// nvmlEventSet_t is nvmlEventSet_st* (opaque pointer stored as intptr_t here)
using NvmlEventSetFreeFn = unsigned int (*)(void*);
extern NvmlEventSetFreeFn p_nvmlEventSetFree;

// Minimal layout-compatible counterpart to nvmlSystemEventSetFreeRequest_v1_t.
// Both fields match the NVML header: {unsigned int version; void* set;}.
// The default member initializers keep a default-constructed request from
// holding indeterminate values; member order and types are unchanged.
struct NvmlSysEventSetFreeRequest {
    unsigned int version = 0;  // sizeof(struct) OR'ed with (API version << 24)
    void* set = nullptr;       // nvmlSystemEventSet_t
};

// nvmlReturn_t nvmlSystemEventSetFree(nvmlSystemEventSetFreeRequest_t*)
using NvmlSysEventSetFreeFn = unsigned int (*)(NvmlSysEventSetFreeRequest*);
extern NvmlSysEventSetFreeFn p_nvmlSysEventSetFree;

// Register both NVML event-set free function pointers.
// Safe to call multiple times: every call overwrites both pointers with the
// values passed in, so repeating a call with the same arguments changes nothing.
void register_nvml_event_set_fn_pointers(intptr_t event_set_free_fn,
intptr_t sys_event_set_free_fn) noexcept;

// ============================================================================
// Handle type aliases - expose only the raw CUDA resource
// ============================================================================
Expand All @@ -171,6 +207,24 @@ using NvvmProgramHandle = std::shared_ptr<const NvvmProgramValue>;
using NvJitLinkHandle = std::shared_ptr<const NvJitLinkValue>;
using CuLinkHandle = std::shared_ptr<const CUlinkState>;
using FileDescriptorHandle = std::shared_ptr<const int>;
using NvmlEventSetHandle = std::shared_ptr<const NvmlEventSetValue>;
using NvmlSysEventSetHandle = std::shared_ptr<const NvmlSysEventSetValue>;

// ============================================================================
// NVML event set handle functions
// ============================================================================

// Create an owning device-scope NVML event set handle.
// handle is the intptr_t value returned by nvml.event_set_create().
// When the last reference is released, nvmlEventSetFree is called (through the
// registered function pointer) if handle is non-zero.
// Returns empty handle if registration has not been done (p_nvmlEventSetFree is null);
// in that case handle is NOT freed and remains the caller's responsibility.
NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle);

// Create an owning system-scope NVML event set handle.
// handle is the intptr_t value returned by nvml.system_event_set_create().
// When the last reference is released, nvmlSystemEventSetFree is called via a
// versioned request struct (through the registered function pointer) if handle
// is non-zero.
// Returns empty handle if registration has not been done (p_nvmlSysEventSetFree
// is null); in that case handle is NOT freed and remains the caller's
// responsibility.
NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle);


// ============================================================================
Expand Down Expand Up @@ -661,6 +715,14 @@ inline std::intptr_t as_intptr(const FileDescriptorHandle& h) noexcept {
return h ? static_cast<std::intptr_t>(*h) : -1;
}

// Raw value of a device-scope NVML event set handle; 0 for an empty handle.
inline std::intptr_t as_intptr(const NvmlEventSetHandle& h) noexcept {
    if (!h) {
        return 0;
    }
    return h->raw;
}

// Raw value of a system-scope NVML event set handle; 0 for an empty handle.
inline std::intptr_t as_intptr(const NvmlSysEventSetHandle& h) noexcept {
    if (!h) {
        return 0;
    }
    return h->raw;
}

// as_py() - convert handle to Python wrapper object (returns new reference)
#if PY_VERSION_HEX < 0x030D0000
extern "C" int _Py_IsFinalizing(void);
Expand Down
17 changes: 17 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
ctypedef shared_ptr[const NvvmProgramValue] NvvmProgramHandle
ctypedef shared_ptr[const NvJitLinkValue] NvJitLinkHandle

# NvmlEventSetValue and NvmlSysEventSetValue are TaggedHandle<intptr_t, Tag>
# instantiations to distinguish the two NVML event set handle types.
# They are declared here without members; the raw value is read through the
# as_intptr() overloads.
cppclass NvmlEventSetValue "cuda_core::NvmlEventSetValue":
pass
cppclass NvmlSysEventSetValue "cuda_core::NvmlSysEventSetValue":
pass
ctypedef shared_ptr[const NvmlEventSetValue] NvmlEventSetHandle
ctypedef shared_ptr[const NvmlSysEventSetValue] NvmlSysEventSetHandle

ctypedef shared_ptr[const cydriver.CUlinkState] CuLinkHandle
ctypedef shared_ptr[const int] FileDescriptorHandle

Expand Down Expand Up @@ -78,6 +87,8 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
intptr_t as_intptr(NvJitLinkHandle h) noexcept nogil
intptr_t as_intptr(CuLinkHandle h) noexcept nogil
intptr_t as_intptr(FileDescriptorHandle h) noexcept nogil
# NVML event set handles: the C++ overloads return 0 for an empty handle.
intptr_t as_intptr(NvmlEventSetHandle h) noexcept nogil
intptr_t as_intptr(NvmlSysEventSetHandle h) noexcept nogil

# as_py() - convert handle to Python wrapper object (inline C++; requires GIL)
object as_py(ContextHandle h)
Expand Down Expand Up @@ -224,6 +235,12 @@ cdef CuLinkHandle create_culink_handle_ref(cydriver.CUlinkState state) except+ n
cdef FileDescriptorHandle create_fd_handle(int fd) except+ nogil
cdef FileDescriptorHandle create_fd_handle_ref(int fd) except+ nogil

# NVML event set handles
# Both create_* functions return an empty handle when the free-function
# pointers have not been registered; the raw handle is then not owned.
# NOTE(review): the neighbouring create_* declarations use `except+`, while
# these two are `noexcept`. The C++ implementations allocate with `new`, so
# confirm that an allocation failure cannot escape as a C++ exception here.
cdef void register_nvml_event_set_fn_pointers(
intptr_t event_set_free_fn, intptr_t sys_event_set_free_fn) noexcept
cdef NvmlEventSetHandle create_nvml_event_set_handle(intptr_t handle) noexcept nogil
cdef NvmlSysEventSetHandle create_nvml_sys_event_set_handle(intptr_t handle) noexcept nogil

# SM resource split (13.1+ — calls through function pointer, safe on older bindings)
# groupParams is void* here to avoid referencing CU_DEV_SM_RESOURCE_GROUP_PARAMS
# (which doesn't exist in cuda-bindings 13.0 .pxd). The C++ side casts it.
Expand Down
2 changes: 2 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,7 @@ GraphicsResourceHandle = shared_ptr
NvrtcProgramHandle = shared_ptr
NvvmProgramHandle = shared_ptr
NvJitLinkHandle = shared_ptr
# NVML event set handles (device-scope and system-scope).
NvmlEventSetHandle = shared_ptr
NvmlSysEventSetHandle = shared_ptr
CuLinkHandle = shared_ptr
FileDescriptorHandle = shared_ptr
8 changes: 8 additions & 0 deletions cuda_core/cuda/core/_resource_handles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,14 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core":
FileDescriptorHandle create_fd_handle_ref "cuda_core::create_fd_handle_ref" (
int fd) except+ nogil

# NVML event set handles
# NOTE(review): create_fd_handle_ref just above is declared `except+`, while
# the two create_nvml_* functions are `noexcept`. Their C++ bodies allocate
# with `new`, so confirm that an allocation failure cannot escape as a C++
# exception here.
void register_nvml_event_set_fn_pointers "cuda_core::register_nvml_event_set_fn_pointers" (
intptr_t event_set_free_fn, intptr_t sys_event_set_free_fn) noexcept
NvmlEventSetHandle create_nvml_event_set_handle "cuda_core::create_nvml_event_set_handle" (
intptr_t handle) noexcept nogil
NvmlSysEventSetHandle create_nvml_sys_event_set_handle "cuda_core::create_nvml_sys_event_set_handle" (
intptr_t handle) noexcept nogil

# SM resource split (13.1+ wrapper — avoids direct cydriver cimport)
# groupParams is void* to avoid referencing CU_DEV_SM_RESOURCE_GROUP_PARAMS
# (which doesn't exist in cuda-bindings 13.0 .pxd). The C++ side casts it.
Expand Down
6 changes: 3 additions & 3 deletions cuda_core/cuda/core/system/_device.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,10 @@ class DeviceEvents:
Represents a set of events that can be waited on for a specific device.
"""

def __init__(self, device_handle: int, events: EventType | str | list[EventType | str]):
...
def close(self):
"""Destroy the device event set, releasing its NVML resources."""

def __dealloc__(self) -> None:
def __init__(self, device_handle: int, events: EventType | str | list[EventType | str]):
...

def wait(self, timeout_ms: int=0) -> EventData:
Expand Down
25 changes: 25 additions & 0 deletions cuda_core/cuda/core/system/_device.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@ import warnings

from cuda.bindings import nvml

from cuda.core._resource_handles cimport (
NvmlEventSetHandle,
as_intptr,
create_nvml_event_set_handle,
register_nvml_event_set_fn_pointers,
)

from ._nvml_context cimport initialize
from cuda.core.system.typing import (
AddressingMode,
Expand Down Expand Up @@ -53,6 +60,24 @@ cdef int _pstate_to_enum(int pstate):
return int(pstate) + int(nvml.Pstates.PSTATE_0)


cdef void _register_nvml_fn_pointers() noexcept:
    # Hand the NVML event-set free functions to the C++ resource-handle layer
    # so the NvmlEventSetHandle / NvmlSysEventSetHandle deleters can call them
    # directly. Does nothing when the internal NVML bindings cannot be
    # imported. A symbol missing from the table is registered as 0.
    # NOTE(review): assumes the internal bindings have already resolved these
    # symbols by the time the table is inspected — confirm.
    cdef intptr_t device_free_addr
    cdef intptr_t system_free_addr

    try:
        from cuda.bindings._internal import nvml as _nvml_internal
    except ImportError:
        return

    pointer_table = _nvml_internal._inspect_function_pointers()
    device_free_addr = pointer_table.get("__nvmlEventSetFree", 0)
    system_free_addr = pointer_table.get("__nvmlSystemEventSetFree", 0)
    register_nvml_event_set_fn_pointers(device_free_addr, system_free_addr)


_register_nvml_fn_pointers()


include "_clock.pxi"
include "_cooler.pxi"
include "_device_attributes.pxi"
Expand Down
21 changes: 10 additions & 11 deletions cuda_core/cuda/core/system/_event.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,10 @@ cdef class DeviceEvents:
"""
Represents a set of events that can be waited on for a specific device.
"""
cdef intptr_t _event_set
cdef NvmlEventSetHandle _h_event_set
cdef intptr_t _device_handle

def __init__(self, device_handle: intptr_t, events: EventType | str | list[EventType | str]):
self._event_set = 0

cdef unsigned long long event_bitmask
if isinstance(events, (str, EventType)):
events = [events]
Expand All @@ -116,14 +114,15 @@ cdef class DeviceEvents:
raise TypeError("events must be an EventType, str, or list of EventType or str")

self._device_handle = device_handle
self._event_set = nvml.event_set_create()
# If this raises, the event needs to be freed and this is handled by
# this class's __dealloc__ method.
nvml.device_register_events(self._device_handle, event_bitmask, self._event_set)
cdef intptr_t raw_set = nvml.event_set_create()
# If device_register_events raises, create_nvml_event_set_handle already
# owns the handle and its shared_ptr deleter will free it.
self._h_event_set = create_nvml_event_set_handle(raw_set)
nvml.device_register_events(self._device_handle, event_bitmask, raw_set)

def __dealloc__(self) -> None:
if self._event_set != 0:
nvml.event_set_free(self._event_set)
cpdef close(self):
"""Destroy the device event set, releasing its NVML resources."""
self._h_event_set.reset()

def wait(self, timeout_ms: int = 0) -> EventData:
"""
Expand Down Expand Up @@ -167,4 +166,4 @@ cdef class DeviceEvents:
:class:`cuda.core.system.GpuIsLostError`
If the GPU has fallen off the bus or is otherwise inaccessible.
"""
return EventData(nvml.event_set_wait_v2(self._event_set, timeout_ms))
return EventData(nvml.event_set_wait_v2(as_intptr(self._h_event_set), timeout_ms))
6 changes: 3 additions & 3 deletions cuda_core/cuda/core/system/_system_events.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ class RegisteredSystemEvents:
Represents a set of events that can be waited on for a specific device.
"""

def __init__(self, events: SystemEventType | str | list[SystemEventType | str]):
...
def close(self):
"""Destroy the system event set, releasing its NVML resources."""

def __dealloc__(self) -> None:
def __init__(self, events: SystemEventType | str | list[SystemEventType | str]):
...

def wait(self, timeout_ms: int=0, buffer_size: int=1) -> SystemEvents:
Expand Down
Loading
Loading