diff --git a/cuda_core/cuda/core/_cpp/resource_handles.cpp b/cuda_core/cuda/core/_cpp/resource_handles.cpp
index c17b2830de..714a84b6ec 100644
--- a/cuda_core/cuda/core/_cpp/resource_handles.cpp
+++ b/cuda_core/cuda/core/_cpp/resource_handles.cpp
@@ -77,8 +77,6 @@ NvJitLinkDestroyFn p_nvJitLinkDestroy = nullptr;
 
 namespace {
 
-using cuda_core::detail::py_is_finalizing;
-
 // Helper to release the GIL while calling into the CUDA driver.
 // This guard is *conditional*: if the caller already dropped the GIL,
 // we avoid calling PyEval_SaveThread (which requires holding the GIL).
@@ -148,6 +146,51 @@ class GILAcquireGuard {
 
 } // namespace
 
+// ============================================================================
+// Handle reverse-lookup registry
+//
+// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
+// shared_ptr so that _ref constructors can recover full metadata.
+// Uses weak_ptr to avoid preventing destruction.
+// ============================================================================
+
+template <typename Key, typename Handle, typename Hash = std::hash<Key>>
+class HandleRegistry {
+public:
+    using MapType = std::unordered_map<Key, std::weak_ptr<typename Handle::element_type>, Hash>;
+
+    void register_handle(const Key& key, const Handle& h) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        map_[key] = h;
+    }
+
+    void unregister_handle(const Key& key) noexcept {
+        try {
+            std::lock_guard<std::mutex> lock(mutex_);
+            auto it = map_.find(key);
+            if (it != map_.end() && it->second.expired()) {
+                map_.erase(it);
+            }
+        } catch (...)
{}
+    }
+
+    Handle lookup(const Key& key) {
+        std::lock_guard<std::mutex> lock(mutex_);
+        auto it = map_.find(key);
+        if (it != map_.end()) {
+            if (auto h = it->second.lock()) {
+                return h;
+            }
+            map_.erase(it);
+        }
+        return {};
+    }
+
+private:
+    std::mutex mutex_;
+    MapType map_;
+};
+
 // ============================================================================
 // Thread-local error handling
 // ============================================================================
@@ -306,10 +349,46 @@ StreamHandle get_per_thread_stream() {
 namespace {
 struct EventBox {
     CUevent resource;
+    bool timing_disabled;
+    bool busy_waited;
+    bool ipc_enabled;
+    int device_id;
+    ContextHandle h_context;
 };
 } // namespace
 
-EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) {
+static const EventBox* get_box(const EventHandle& h) {
+    const CUevent* p = h.get();
+    return reinterpret_cast<const EventBox*>(
+        reinterpret_cast<const char*>(p) - offsetof(EventBox, resource)
+    );
+}
+
+bool get_event_timing_disabled(const EventHandle& h) noexcept {
+    return h ? get_box(h)->timing_disabled : true;
+}
+
+bool get_event_busy_waited(const EventHandle& h) noexcept {
+    return h ? get_box(h)->busy_waited : false;
+}
+
+bool get_event_ipc_enabled(const EventHandle& h) noexcept {
+    return h ? get_box(h)->ipc_enabled : false;
+}
+
+int get_event_device_id(const EventHandle& h) noexcept {
+    return h ? get_box(h)->device_id : -1;
+}
+
+ContextHandle get_event_context(const EventHandle& h) noexcept {
+    return h ?
get_box(h)->h_context : ContextHandle{};
+}
+
+static HandleRegistry<CUevent, EventHandle> event_registry;
+
+EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
+                                bool timing_disabled, bool busy_waited,
+                                bool ipc_enabled, int device_id) {
     GILReleaseGuard gil;
     CUevent event;
     if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
@@ -317,21 +396,33 @@ EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags)
     }
 
     auto box = std::shared_ptr<EventBox>(
-        new EventBox{event},
+        new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx},
         [h_ctx](const EventBox* b) {
+            event_registry.unregister_handle(b->resource);
             GILReleaseGuard gil;
             p_cuEventDestroy(b->resource);
             delete b;
         }
     );
-    return EventHandle(box, &box->resource);
+    EventHandle h(box, &box->resource);
+    event_registry.register_handle(event, h);
+    return h;
 }
 
 EventHandle create_event_handle_noctx(unsigned int flags) {
-    return create_event_handle(ContextHandle{}, flags);
+    return create_event_handle(ContextHandle{}, flags, true, false, false, -1);
 }
 
-EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
+EventHandle create_event_handle_ref(CUevent event) {
+    if (auto h = event_registry.lookup(event)) {
+        return h;
+    }
+    auto box = std::make_shared<EventBox>(EventBox{event, true, false, false, -1, {}});
+    return EventHandle(box, &box->resource);
+}
+
+EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
+                                    bool busy_waited) {
     GILReleaseGuard gil;
     CUevent event;
     if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
@@ -339,14 +430,17 @@ EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle) {
     }
 
     auto box = std::shared_ptr<EventBox>(
-        new EventBox{event},
+        new EventBox{event, true, busy_waited, true, -1, {}},
         [](const EventBox* b) {
+            event_registry.unregister_handle(b->resource);
             GILReleaseGuard gil;
             p_cuEventDestroy(b->resource);
             delete b;
         }
     );
-    return EventHandle(box, &box->resource);
+    EventHandle h(box,
&box->resource);
+    event_registry.register_handle(event, h);
+    return h;
 }
 
 // ============================================================================
@@ -653,61 +747,43 @@ struct ExportDataKeyHash {
 }
 
-static std::mutex ipc_ptr_cache_mutex;
-static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
+static HandleRegistry<ExportDataKey, DevicePtrHandle, ExportDataKeyHash> ipc_ptr_cache;
+static std::mutex ipc_import_mutex;
 
 DevicePtrHandle deviceptr_import_ipc(const MemoryPoolHandle& h_pool,
                                      const void* export_data,
                                      const StreamHandle& h_stream) {
     auto data = const_cast<CUmemPoolPtrExportData*>(
         reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));
 
     if (use_ipc_ptr_cache()) {
-        // Check cache before calling cuMemPoolImportPointer
         ExportDataKey key;
         std::memcpy(&key.data, data, sizeof(key.data));
 
-        std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
+        std::lock_guard<std::mutex> lock(ipc_import_mutex);
 
-        auto it = ipc_ptr_cache.find(key);
-        if (it != ipc_ptr_cache.end()) {
-            if (auto box = it->second.lock()) {
-                // Cache hit - return existing handle
-                return DevicePtrHandle(box, &box->resource);
-            }
-            ipc_ptr_cache.erase(it);  // Expired entry
+        if (auto h = ipc_ptr_cache.lookup(key)) {
+            return h;
         }
 
-        // Cache miss - import the pointer
         GILReleaseGuard gil;
         CUdeviceptr ptr;
         if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
             return {};
         }
 
-        // Create new handle with cache-clearing deleter
        auto box = std::shared_ptr<DevicePtrBox>(
             new DevicePtrBox{ptr, h_stream},
             [h_pool, key](DevicePtrBox* b) {
+                ipc_ptr_cache.unregister_handle(key);
                 GILReleaseGuard gil;
-                try {
-                    std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
-                    // Only erase if expired - avoids race where another thread
-                    // replaced the entry with a new import before we acquired the lock.
-                    auto it = ipc_ptr_cache.find(key);
-                    if (it != ipc_ptr_cache.end() && it->second.expired()) {
-                        ipc_ptr_cache.erase(it);
-                    }
-                } catch (...)
{
-                    // Cache cleanup is best-effort - swallow exceptions in destructor context
-                }
                 p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
                 delete b;
             }
         );
-        ipc_ptr_cache[key] = box;
-        return DevicePtrHandle(box, &box->resource);
+        DevicePtrHandle h(box, &box->resource);
+        ipc_ptr_cache.register_handle(key, h);
+        return h;
     } else {
-        // No caching - simple handle creation
         GILReleaseGuard gil;
         CUdeviceptr ptr;
         if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
@@ -786,10 +862,19 @@ LibraryHandle create_library_handle_ref(CUlibrary library) {
 namespace {
 struct KernelBox {
     CUkernel resource;
-    LibraryHandle h_library;  // Keeps library alive
+    LibraryHandle h_library;
 };
 } // namespace
 
+static const KernelBox* get_box(const KernelHandle& h) {
+    const CUkernel* p = h.get();
+    return reinterpret_cast<const KernelBox*>(
+        reinterpret_cast<const char*>(p) - offsetof(KernelBox, resource)
+    );
+}
+
+static HandleRegistry<CUkernel, KernelHandle> kernel_registry;
+
 KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) {
     GILReleaseGuard gil;
     CUkernel kernel;
@@ -797,14 +882,25 @@ KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* na
         return {};
     }
 
-    return create_kernel_handle_ref(kernel, h_library);
+    auto box = std::make_shared<KernelBox>(KernelBox{kernel, h_library});
+    KernelHandle h(box, &box->resource);
+    kernel_registry.register_handle(kernel, h);
+    return h;
 }
 
-KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library) {
-    auto box = std::make_shared<KernelBox>(KernelBox{kernel, h_library});
+KernelHandle create_kernel_handle_ref(CUkernel kernel) {
+    if (auto h = kernel_registry.lookup(kernel)) {
+        return h;
+    }
+    auto box = std::make_shared<KernelBox>(KernelBox{kernel, {}});
     return KernelHandle(box, &box->resource);
 }
 
+LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
+    if (!h) return {};
+    return get_box(h)->h_library;
+}
+
 // ============================================================================
 // Graphics Resource Handles
 //
============================================================================ diff --git a/cuda_core/cuda/core/_cpp/resource_handles.hpp b/cuda_core/cuda/core/_cpp/resource_handles.hpp index 2f9c19fe15..28b2d8d1c8 100644 --- a/cuda_core/cuda/core/_cpp/resource_handles.hpp +++ b/cuda_core/cuda/core/_cpp/resource_handles.hpp @@ -200,9 +200,12 @@ StreamHandle get_per_thread_stream(); // Create an owning event handle by calling cuEventCreate. // The event structurally depends on the provided context handle. +// Metadata fields are stored in the EventBox for later retrieval. // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). -EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags); +EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags, + bool timing_disabled, bool busy_waited, + bool ipc_enabled, int device_id); // Create an owning event handle without context dependency. // Use for temporary events that are created and destroyed in the same scope. @@ -214,7 +217,21 @@ EventHandle create_event_handle_noctx(unsigned int flags); // The originating process owns the event and its context. // When the last reference is released, cuEventDestroy is called automatically. // Returns empty handle on error (caller must check). -EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle); +EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle, + bool busy_waited); + +// Create a non-owning event handle (references existing event). +// Use for events that are managed by the CUDA graph or another owner. +// The event will NOT be destroyed when the handle is released. +// Metadata defaults to unknown (timing_disabled=true, device_id=-1). 
+EventHandle create_event_handle_ref(CUevent event); + +// Event metadata accessors (read from EventBox via pointer arithmetic) +bool get_event_timing_disabled(const EventHandle& h) noexcept; +bool get_event_busy_waited(const EventHandle& h) noexcept; +bool get_event_ipc_enabled(const EventHandle& h) noexcept; +int get_event_device_id(const EventHandle& h) noexcept; +ContextHandle get_event_context(const EventHandle& h) noexcept; // ============================================================================ // Memory pool handle functions @@ -345,9 +362,14 @@ LibraryHandle create_library_handle_ref(CUlibrary library); // Returns empty handle on error (caller must check). KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name); -// Create a non-owning kernel handle with library dependency. -// Use for borrowed kernels. The library handle keeps the library alive. -KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library); +// Create a kernel handle from a raw CUkernel. +// If the kernel is already managed (in the registry), returns the owning +// handle with library dependency. Otherwise returns a non-owning ref. +KernelHandle create_kernel_handle_ref(CUkernel kernel); + +// Get the library handle associated with a kernel (from KernelBox). +// Returns empty handle if the kernel has no library dependency. +LibraryHandle get_kernel_library(const KernelHandle& h) noexcept; // ============================================================================ // Graphics resource handle functions @@ -516,8 +538,6 @@ inline std::intptr_t as_intptr(const CuLinkHandle& h) noexcept { } // as_py() - convert handle to Python wrapper object (returns new reference) -namespace detail { - #if PY_VERSION_HEX < 0x030D0000 extern "C" int _Py_IsFinalizing(void); #endif @@ -530,6 +550,7 @@ inline bool py_is_finalizing() noexcept { #endif } +namespace detail { // n.b. 
class lookup is not cached to avoid deadlock hazard, see DESIGN.md inline PyObject* make_py(const char* module_name, const char* class_name, std::intptr_t value) noexcept { if (py_is_finalizing()) { diff --git a/cuda_core/cuda/core/_event.pxd b/cuda_core/cuda/core/_event.pxd index c393b29ebf..5710b13699 100644 --- a/cuda_core/cuda/core/_event.pxd +++ b/cuda_core/cuda/core/_event.pxd @@ -10,15 +10,13 @@ cdef class Event: cdef: EventHandle _h_event - ContextHandle _h_context - bint _timing_disabled - bint _busy_waited - bint _ipc_enabled object _ipc_descriptor - int _device_id object __weakref__ @staticmethod cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free) + @staticmethod + cdef Event _from_handle(EventHandle h_event) + cpdef close(self) diff --git a/cuda_core/cuda/core/_event.pyx b/cuda_core/cuda/core/_event.pyx index 1ff87a1ea0..4a0491d865 100644 --- a/cuda_core/cuda/core/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -13,6 +13,11 @@ from cuda.core._resource_handles cimport ( EventHandle, create_event_handle, create_event_handle_ipc, + get_event_timing_disabled, + get_event_busy_waited, + get_event_ipc_enabled, + get_event_device_id, + get_event_context, as_intptr, as_cu, as_py, @@ -95,36 +100,46 @@ cdef class Event: cdef Event self = cls.__new__(cls) cdef EventOptions opts = check_or_create_options(EventOptions, options, "Event options") cdef unsigned int flags = 0x0 - self._timing_disabled = False - self._busy_waited = False - self._ipc_enabled = False + cdef bint timing_disabled = False + cdef bint busy_waited = False + cdef bint ipc_enabled = False self._ipc_descriptor = None if not opts.enable_timing: flags |= cydriver.CUevent_flags.CU_EVENT_DISABLE_TIMING - self._timing_disabled = True + timing_disabled = True if opts.busy_waited_sync: flags |= cydriver.CUevent_flags.CU_EVENT_BLOCKING_SYNC - self._busy_waited = True + busy_waited = True if opts.ipc_enabled: if is_free: raise TypeError( "IPC-enabled events must be 
bound; use Stream.record for creation." ) flags |= cydriver.CUevent_flags.CU_EVENT_INTERPROCESS - self._ipc_enabled = True - if not self._timing_disabled: + ipc_enabled = True + if not timing_disabled: raise TypeError("IPC-enabled events cannot use timing.") - # C++ creates the event and returns owning handle with context dependency - cdef EventHandle h_event = create_event_handle(h_context, flags) + cdef EventHandle h_event = create_event_handle( + h_context, flags, timing_disabled, busy_waited, ipc_enabled, device_id) if not h_event: raise RuntimeError("Failed to create CUDA event") self._h_event = h_event - self._h_context = h_context - self._device_id = device_id - if opts.ipc_enabled: + if ipc_enabled: self.get_ipc_descriptor() return self + @staticmethod + cdef Event _from_handle(EventHandle h_event): + """Create an Event wrapping an existing EventHandle. + + Metadata (timing, busy_waited, ipc, device_id) is read from the + EventBox via pointer arithmetic — no fields are cached on Event. + """ + cdef Event self = Event.__new__(Event) + self._h_event = h_event + self._ipc_descriptor = None + return self + cpdef close(self): """Destroy the event. 
@@ -191,7 +206,7 @@ cdef class Event: with nogil: HANDLE_RETURN(cydriver.cuIpcGetEventHandle(&data, as_cu(self._h_event))) cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - self._ipc_descriptor = IPCEventDescriptor._init(data_b, self._busy_waited) + self._ipc_descriptor = IPCEventDescriptor._init(data_b, get_event_busy_waited(self._h_event)) return self._ipc_descriptor @classmethod @@ -200,33 +215,27 @@ cdef class Event: cdef cydriver.CUipcEventHandle data memcpy(data.reserved, (ipc_descriptor._reserved), sizeof(data.reserved)) cdef Event self = Event.__new__(cls) - # IPC events: the originating process owns the event and its context - cdef EventHandle h_event = create_event_handle_ipc(data) + cdef EventHandle h_event = create_event_handle_ipc(data, ipc_descriptor._busy_waited) if not h_event: raise RuntimeError("Failed to open IPC event handle") self._h_event = h_event - self._h_context = ContextHandle() - self._timing_disabled = True - self._busy_waited = ipc_descriptor._busy_waited - self._ipc_enabled = True self._ipc_descriptor = ipc_descriptor - self._device_id = -1 return self @property def is_ipc_enabled(self) -> bool: """Return True if the event can be shared across process boundaries, otherwise False.""" - return self._ipc_enabled + return get_event_ipc_enabled(self._h_event) @property def is_timing_disabled(self) -> bool: """Return True if the event does not record timing data, otherwise False.""" - return self._timing_disabled + return get_event_timing_disabled(self._h_event) @property def is_sync_busy_waited(self) -> bool: """Return True if the event synchronization would keep the CPU busy-waiting, otherwise False.""" - return self._busy_waited + return get_event_busy_waited(self._h_event) def sync(self): """Synchronize until the event completes. @@ -274,15 +283,18 @@ cdef class Event: context is set current after a event is created. 
""" - if self._device_id >= 0: + cdef int dev_id = get_event_device_id(self._h_event) + if dev_id >= 0: from ._device import Device # avoid circular import - return Device(self._device_id) + return Device(dev_id) @property def context(self) -> Context: """Return the :obj:`~_context.Context` associated with this event.""" - if self._h_context and self._device_id >= 0: - return Context._from_handle(Context, self._h_context, self._device_id) + cdef ContextHandle h_ctx = get_event_context(self._h_event) + cdef int dev_id = get_event_device_id(self._h_event) + if h_ctx and dev_id >= 0: + return Context._from_handle(Context, h_ctx, dev_id) cdef class IPCEventDescriptor: diff --git a/cuda_core/cuda/core/_graph.py b/cuda_core/cuda/core/_graph/__init__.py similarity index 100% rename from cuda_core/cuda/core/_graph.py rename to cuda_core/cuda/core/_graph/__init__.py diff --git a/cuda_core/cuda/core/_module.pxd b/cuda_core/cuda/core/_module.pxd index 9468de3dff..1d3a0772c3 100644 --- a/cuda_core/cuda/core/_module.pxd +++ b/cuda_core/cuda/core/_module.pxd @@ -16,10 +16,11 @@ cdef class Kernel: KernelHandle _h_kernel KernelAttributes _attributes # lazy KernelOccupancy _occupancy # lazy + object _keepalive object __weakref__ @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel) + cdef Kernel _from_handle(KernelHandle h_kernel) cdef tuple _get_arguments_info(self, bint param_info=*) diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index ca5562f990..4e8f810619 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -19,9 +19,9 @@ from cuda.core._resource_handles cimport ( KernelHandle, create_library_handle_from_file, create_library_handle_from_data, - create_library_handle_ref, create_kernel_handle, create_kernel_handle_ref, + get_kernel_library, get_last_error, as_cu, as_py, @@ -493,7 +493,7 @@ cdef class Kernel: raise RuntimeError("Kernel objects cannot be instantiated directly. 
Please use ObjectCode APIs.") @staticmethod - cdef Kernel _from_obj(KernelHandle h_kernel): + cdef Kernel _from_handle(KernelHandle h_kernel): cdef Kernel ker = Kernel.__new__(Kernel) ker._h_kernel = h_kernel ker._attributes = None @@ -567,9 +567,7 @@ cdef class Kernel: @staticmethod def from_handle(handle, mod: ObjectCode = None) -> Kernel: - """Creates a new :obj:`Kernel` object from a foreign kernel handle. - - Uses a CUkernel pointer address to create a new :obj:`Kernel` object. + """Creates a new :obj:`Kernel` object from a kernel handle. Parameters ---------- @@ -577,37 +575,37 @@ cdef class Kernel: Kernel handle representing the address of a foreign kernel object (CUkernel). mod : :obj:`ObjectCode`, optional - The ObjectCode object associated with this kernel. If not provided, - a placeholder ObjectCode will be created. Note that without a proper - ObjectCode, certain operations may be limited. + The ObjectCode object associated with this kernel. Provides + library lifetime for foreign kernels not created by + cuda.core. 
""" - # Validate that handle is an integer if not isinstance(handle, int): raise TypeError(f"handle must be an integer, got {type(handle).__name__}") - # Convert the integer handle to CUkernel cdef cydriver.CUkernel cu_kernel = handle - cdef KernelHandle h_kernel - cdef cydriver.CUlibrary cu_library - cdef cydriver.CUresult err - - # If no module provided, create a placeholder and try to get the library - if mod is None: - mod = ObjectCode._init(b"", "cubin") - if _is_cukernel_get_library_supported(): - # Try to get the owning library via cuKernelGetLibrary - with nogil: - err = cydriver.cuKernelGetLibrary(&cu_library, cu_kernel) - if err == cydriver.CUDA_SUCCESS: - mod._h_library = create_library_handle_ref(cu_library) - - # Create kernel handle with library dependency - h_kernel = create_kernel_handle_ref(cu_kernel, mod._h_library) + cdef KernelHandle h_kernel = create_kernel_handle_ref(cu_kernel) if not h_kernel: HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(h_kernel) + cdef LibraryHandle h_existing_lib = get_kernel_library(h_kernel) + cdef LibraryHandle h_caller_lib + + if mod is not None: + h_caller_lib = (mod)._h_library + if h_existing_lib and h_caller_lib: + if as_cu(h_existing_lib) != as_cu(h_caller_lib): + import warnings + warnings.warn( + "The library from the provided ObjectCode does not match " + "the library associated with this kernel.", + stacklevel=2, + ) + + cdef Kernel k = Kernel._from_handle(h_kernel) + if mod is not None and not h_existing_lib: + k._keepalive = mod + return k def __eq__(self, other) -> bool: if not isinstance(other, Kernel): @@ -825,7 +823,7 @@ cdef class ObjectCode: cdef KernelHandle h_kernel = create_kernel_handle(self._h_library, name) if not h_kernel: HANDLE_RETURN(get_last_error()) - return Kernel._from_obj(h_kernel) + return Kernel._from_handle(h_kernel) @property def code(self) -> CodeTypeT: diff --git a/cuda_core/cuda/core/_resource_handles.pxd b/cuda_core/cuda/core/_resource_handles.pxd index 
c5a1ab36a6..00fe4ec800 100644 --- a/cuda_core/cuda/core/_resource_handles.pxd +++ b/cuda_core/cuda/core/_resource_handles.pxd @@ -108,10 +108,21 @@ cdef StreamHandle get_legacy_stream() except+ nogil cdef StreamHandle get_per_thread_stream() except+ nogil # Event handles -cdef EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) except+ nogil +cdef EventHandle create_event_handle( + const ContextHandle& h_ctx, unsigned int flags, + bint timing_disabled, bint busy_waited, + bint ipc_enabled, int device_id) except+ nogil cdef EventHandle create_event_handle_noctx(unsigned int flags) except+ nogil +cdef EventHandle create_event_handle_ref(cydriver.CUevent event) except+ nogil cdef EventHandle create_event_handle_ipc( - const cydriver.CUipcEventHandle& ipc_handle) except+ nogil + const cydriver.CUipcEventHandle& ipc_handle, bint busy_waited) except+ nogil + +# Event metadata getters +cdef bint get_event_timing_disabled(const EventHandle& h) noexcept nogil +cdef bint get_event_busy_waited(const EventHandle& h) noexcept nogil +cdef bint get_event_ipc_enabled(const EventHandle& h) noexcept nogil +cdef int get_event_device_id(const EventHandle& h) noexcept nogil +cdef ContextHandle get_event_context(const EventHandle& h) noexcept nogil # Memory pool handles cdef MemoryPoolHandle create_mempool_handle( @@ -150,8 +161,8 @@ cdef LibraryHandle create_library_handle_ref(cydriver.CUlibrary library) except+ # Kernel handles cdef KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) except+ nogil -cdef KernelHandle create_kernel_handle_ref( - cydriver.CUkernel kernel, const LibraryHandle& h_library) except+ nogil +cdef KernelHandle create_kernel_handle_ref(cydriver.CUkernel kernel) except+ nogil +cdef LibraryHandle get_kernel_library(const KernelHandle& h) noexcept nogil # Graphics resource handles cdef GraphicsResourceHandle create_graphics_resource_handle( diff --git a/cuda_core/cuda/core/_resource_handles.pyx 
b/cuda_core/cuda/core/_resource_handles.pyx index eebaed2e28..1e7facbea5 100644 --- a/cuda_core/cuda/core/_resource_handles.pyx +++ b/cuda_core/cuda/core/_resource_handles.pyx @@ -70,11 +70,27 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": # Event handles (note: _create_event_handle* are internal due to C++ overloading) EventHandle create_event_handle "cuda_core::create_event_handle" ( - const ContextHandle& h_ctx, unsigned int flags) except+ nogil + const ContextHandle& h_ctx, unsigned int flags, + bint timing_disabled, bint busy_waited, + bint ipc_enabled, int device_id) except+ nogil EventHandle create_event_handle_noctx "cuda_core::create_event_handle_noctx" ( unsigned int flags) except+ nogil + EventHandle create_event_handle_ref "cuda_core::create_event_handle_ref" ( + cydriver.CUevent event) except+ nogil EventHandle create_event_handle_ipc "cuda_core::create_event_handle_ipc" ( - const cydriver.CUipcEventHandle& ipc_handle) except+ nogil + const cydriver.CUipcEventHandle& ipc_handle, bint busy_waited) except+ nogil + + # Event metadata getters + bint get_event_timing_disabled "cuda_core::get_event_timing_disabled" ( + const EventHandle& h) noexcept nogil + bint get_event_busy_waited "cuda_core::get_event_busy_waited" ( + const EventHandle& h) noexcept nogil + bint get_event_ipc_enabled "cuda_core::get_event_ipc_enabled" ( + const EventHandle& h) noexcept nogil + int get_event_device_id "cuda_core::get_event_device_id" ( + const EventHandle& h) noexcept nogil + ContextHandle get_event_context "cuda_core::get_event_context" ( + const EventHandle& h) noexcept nogil # Memory pool handles MemoryPoolHandle create_mempool_handle "cuda_core::create_mempool_handle" ( @@ -126,7 +142,9 @@ cdef extern from "_cpp/resource_handles.hpp" namespace "cuda_core": KernelHandle create_kernel_handle "cuda_core::create_kernel_handle" ( const LibraryHandle& h_library, const char* name) except+ nogil KernelHandle create_kernel_handle_ref 
"cuda_core::create_kernel_handle_ref" ( - cydriver.CUkernel kernel, const LibraryHandle& h_library) except+ nogil + cydriver.CUkernel kernel) except+ nogil + LibraryHandle get_kernel_library "cuda_core::get_kernel_library" ( + const KernelHandle& h) noexcept nogil # Graphics resource handles GraphicsResourceHandle create_graphics_resource_handle "cuda_core::create_graphics_resource_handle" ( diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index e74b1fc672..2bc7e25d21 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -511,6 +511,42 @@ def test_kernel_from_handle_multiple_instances(get_saxpy_kernel_cubin): assert int(kernel1.handle) == int(kernel2.handle) == int(kernel3.handle) == handle +def test_kernel_from_handle_library_mismatch_warning(init_cuda): + """Kernel.from_handle warns when caller-supplied module differs from the kernel's library.""" + prog1 = Program(SAXPY_KERNEL, code_type="c++") + mod1 = prog1.compile("cubin", name_expressions=("saxpy",)) + kernel = mod1.get_kernel("saxpy") + handle = int(kernel.handle) + + prog2 = Program(SAXPY_KERNEL, code_type="c++") + mod2 = prog2.compile("cubin", name_expressions=("saxpy",)) + mod2.get_kernel("saxpy") + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + k = Kernel.from_handle(handle, mod2) + assert len(w) == 1 + assert "does not match" in str(w[0].message) + + assert k.attributes.max_threads_per_block() > 0 + + +def test_kernel_from_handle_foreign_kernel(init_cuda): + """Kernel.from_handle with a driver-level kernel not created by cuda.core.""" + prog = Program(SAXPY_KERNEL, code_type="c++") + mod = prog.compile("cubin", name_expressions=("saxpy",)) + cubin = mod.code + sym_map = mod.symbol_mapping + + cu_lib = handle_return(driver.cuLibraryLoadData(cubin, [], [], 0, [], [], 0)) + mangled = sym_map["saxpy"] + cu_kernel = handle_return(driver.cuLibraryGetKernel(cu_lib, mangled)) + handle = int(cu_kernel) + + k = 
Kernel.from_handle(handle) + assert k.attributes.max_threads_per_block() > 0 + + def test_kernel_keeps_library_alive(init_cuda): """Test that a Kernel keeps its underlying library alive after ObjectCode goes out of scope.""" import gc