Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 136 additions & 40 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ NvJitLinkDestroyFn p_nvJitLinkDestroy = nullptr;

namespace {

using cuda_core::detail::py_is_finalizing;

// Helper to release the GIL while calling into the CUDA driver.
// This guard is *conditional*: if the caller already dropped the GIL,
// we avoid calling PyEval_SaveThread (which requires holding the GIL).
Expand Down Expand Up @@ -148,6 +146,51 @@ class GILAcquireGuard {

} // namespace

// ============================================================================
// Handle reverse-lookup registry
//
// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
// shared_ptr so that _ref constructors can recover full metadata.
// Uses weak_ptr to avoid preventing destruction.
// ============================================================================

// ============================================================================
// Handle reverse-lookup registry
//
// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
// shared_ptr so that _ref constructors can recover full metadata.
// Entries are weak_ptr so the registry never extends a handle's lifetime.
// All operations are guarded by the internal mutex.
// ============================================================================

template<typename Key, typename Handle, typename Hash = std::hash<Key>>
class HandleRegistry {
public:
    // `Handle` is expected to be a shared_ptr-like type; element_type is
    // the boxed resource it points at.
    using MapType = std::unordered_map<Key, std::weak_ptr<typename Handle::element_type>, Hash>;

    // Record (or overwrite) the weak mapping key -> handle.
    // NOTE(review): explicit template arguments are kept on lock_guard —
    // a reviewer noted CTAD was not available on the MSVC configuration used.
    void register_handle(const Key& key, const Handle& h) {
        std::lock_guard<std::mutex> lock(mutex_);
        map_[key] = h;
    }

    // Remove the entry for `key`, but only if it has expired. This avoids a
    // race where another thread re-registered a fresh handle under the same
    // key before this (deleter-context) cleanup acquired the lock.
    // noexcept: called from shared_ptr deleters, so failures are swallowed.
    void unregister_handle(const Key& key) noexcept {
        try {
            std::lock_guard<std::mutex> lock(mutex_);
            auto it = map_.find(key);
            if (it != map_.end() && it->second.expired()) {
                map_.erase(it);
            }
        } catch (...) {}
    }

    // Return the live handle registered under `key`, or an empty handle.
    // Expired entries are erased opportunistically on lookup.
    Handle lookup(const Key& key) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = map_.find(key);
        if (it != map_.end()) {
            if (auto h = it->second.lock()) {
                return h;
            }
            map_.erase(it);
        }
        return {};
    }

private:
    std::mutex mutex_;
    MapType map_;
};

// ============================================================================
// Thread-local error handling
// ============================================================================
Expand Down Expand Up @@ -306,47 +349,98 @@ StreamHandle get_per_thread_stream() {
namespace {
// Storage backing an EventHandle. get_box() recovers the enclosing box
// from a pointer to `resource` using offsetof, so the handle itself only
// exposes the raw CUevent while the box carries the metadata.
struct EventBox {
    CUevent resource;        // raw CUDA event handle
    bool timing_disabled;    // creation-time metadata (see create_event_handle)
    bool busy_waited;        // creation-time metadata
    bool ipc_enabled;        // creation-time metadata
    int device_id;           // device id recorded at creation; -1 if unknown
    ContextHandle h_context; // keeps the owning context alive; may be empty
};
} // namespace

EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) {
// Recover the EventBox that owns the CUevent an EventHandle points at.
// The handle aliases &EventBox::resource, so stepping back by the member
// offset yields the enclosing box.
static const EventBox* get_box(const EventHandle& h) {
    auto bytes = reinterpret_cast<const char*>(h.get());
    return reinterpret_cast<const EventBox*>(bytes - offsetof(EventBox, resource));
}

// Event metadata accessors: read fields from the EventBox backing the
// handle. Each returns a conservative default for an empty handle.

// Empty handles report timing as disabled.
bool get_event_timing_disabled(const EventHandle& h) noexcept {
    return h ? get_box(h)->timing_disabled : true;
}

bool get_event_busy_waited(const EventHandle& h) noexcept {
    return h ? get_box(h)->busy_waited : false;
}

bool get_event_ipc_enabled(const EventHandle& h) noexcept {
    return h ? get_box(h)->ipc_enabled : false;
}

// -1 means "unknown device".
int get_event_device_id(const EventHandle& h) noexcept {
    return h ? get_box(h)->device_id : -1;
}

// May return an empty handle for events created without a context dependency.
ContextHandle get_event_context(const EventHandle& h) noexcept {
    return h ? get_box(h)->h_context : ContextHandle{};
}

// Reverse map: raw CUevent -> weak ref to its owning EventHandle, so
// create_event_handle_ref can recover full metadata for managed events.
static HandleRegistry<CUevent, EventHandle> event_registry;

// Create an owning event handle via cuEventCreate.
// The metadata flags and device id are stored in the EventBox for later
// retrieval through the get_event_* accessors; h_ctx is captured by the
// deleter so the context outlives the event.
// Returns an empty handle on driver error (thread-local `err` holds it).
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
                                bool timing_disabled, bool busy_waited,
                                bool ipc_enabled, int device_id) {
    GILReleaseGuard gil;  // driver call may block; do not hold the GIL
    CUevent event;
    if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
        return {};
    }

    auto box = std::shared_ptr<const EventBox>(
        new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx},
        [h_ctx](const EventBox* b) {
            // Drop the registry entry before destroying the event so no
            // lookup can hand out the dying handle's raw CUevent.
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    // Aliasing constructor: the handle exposes &resource while the box
    // controls the lifetime.
    EventHandle h(box, &box->resource);
    event_registry.register_handle(event, h);
    return h;
}

// Create an owning event handle with no context dependency.
// Metadata defaults: timing disabled, not busy-waited, no IPC, unknown device.
EventHandle create_event_handle_noctx(unsigned int flags) {
    return create_event_handle(ContextHandle{}, flags, true, false, false, -1);
}

// Wrap a raw CUevent without taking ownership.
// If the event is already managed (present in the registry), return the
// owning handle, which carries full metadata. Otherwise build a ref box
// with unknown metadata; its default deleter frees only the box —
// cuEventDestroy is never called for refs.
EventHandle create_event_handle_ref(CUevent event) {
    if (auto h = event_registry.lookup(event)) {
        return h;
    }
    auto box = std::make_shared<const EventBox>(EventBox{event, true, false, false, -1, {}});
    return EventHandle(box, &box->resource);
}

// Import an event shared from another process via cuIpcOpenEventHandle.
// The box records timing_disabled=true and ipc_enabled=true; the device is
// unknown (-1) and no context handle is kept — the originating process owns
// the event's context.
// Returns an empty handle on driver error (thread-local `err` holds it).
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
                                    bool busy_waited) {
    GILReleaseGuard gil;
    CUevent event;
    if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
        return {};
    }

    auto box = std::shared_ptr<const EventBox>(
        new EventBox{event, true, busy_waited, true, -1, {}},
        [](const EventBox* b) {
            // Unregister first so lookups cannot resurrect a dying event.
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    EventHandle h(box, &box->resource);
    event_registry.register_handle(event, h);
    return h;
}

// ============================================================================
Expand Down Expand Up @@ -653,61 +747,43 @@ struct ExportDataKeyHash {

}

static std::mutex ipc_ptr_cache_mutex;
static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
static HandleRegistry<ExportDataKey, DevicePtrHandle, ExportDataKeyHash> ipc_ptr_cache;
static std::mutex ipc_import_mutex;

DevicePtrHandle deviceptr_import_ipc(const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) {
auto data = const_cast<CUmemPoolPtrExportData*>(
reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));

if (use_ipc_ptr_cache()) {
// Check cache before calling cuMemPoolImportPointer
ExportDataKey key;
std::memcpy(&key.data, data, sizeof(key.data));

std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
std::lock_guard<std::mutex> lock(ipc_import_mutex);

auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end()) {
if (auto box = it->second.lock()) {
// Cache hit - return existing handle
return DevicePtrHandle(box, &box->resource);
}
ipc_ptr_cache.erase(it); // Expired entry
if (auto h = ipc_ptr_cache.lookup(key)) {
return h;
}

// Cache miss - import the pointer
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
return {};
}

// Create new handle with cache-clearing deleter
auto box = std::shared_ptr<DevicePtrBox>(
new DevicePtrBox{ptr, h_stream},
[h_pool, key](DevicePtrBox* b) {
ipc_ptr_cache.unregister_handle(key);
GILReleaseGuard gil;
try {
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
// Only erase if expired - avoids race where another thread
// replaced the entry with a new import before we acquired the lock.
auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end() && it->second.expired()) {
ipc_ptr_cache.erase(it);
}
} catch (...) {
// Cache cleanup is best-effort - swallow exceptions in destructor context
}
p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
delete b;
}
);
ipc_ptr_cache[key] = box;
return DevicePtrHandle(box, &box->resource);
DevicePtrHandle h(box, &box->resource);
ipc_ptr_cache.register_handle(key, h);
return h;

} else {
// No caching - simple handle creation
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
Expand Down Expand Up @@ -786,25 +862,45 @@ LibraryHandle create_library_handle_ref(CUlibrary library) {
namespace {
// Storage backing a KernelHandle; get_box() recovers the box from a
// pointer to `resource` using offsetof.
struct KernelBox {
    CUkernel resource;       // raw CUDA kernel handle
    LibraryHandle h_library; // keeps the owning library alive; may be empty
};
} // namespace

// Recover the KernelBox backing a KernelHandle (the handle aliases
// &KernelBox::resource, so subtract the member offset).
static const KernelBox* get_box(const KernelHandle& h) {
    auto bytes = reinterpret_cast<const char*>(h.get());
    return reinterpret_cast<const KernelBox*>(bytes - offsetof(KernelBox, resource));
}

// Reverse map: raw CUkernel -> weak ref to its owning KernelHandle.
static HandleRegistry<CUkernel, KernelHandle> kernel_registry;

// Look up `name` in the library and wrap the result in a KernelHandle.
// The box stores h_library so the library outlives every kernel handle,
// and the handle is registered so create_kernel_handle_ref can later
// recover the library dependency from the raw CUkernel.
// Returns an empty handle on driver error (thread-local `err` holds it).
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) {
    GILReleaseGuard gil;
    CUkernel kernel;
    if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) {
        return {};
    }

    auto box = std::make_shared<const KernelBox>(KernelBox{kernel, h_library});
    KernelHandle h(box, &box->resource);
    kernel_registry.register_handle(kernel, h);
    return h;
}

// Wrap a raw CUkernel. If the kernel is already managed (present in the
// registry), return the registered handle, which carries the library
// dependency. Otherwise return a non-owning ref with no library attached;
// its default deleter frees only the box.
KernelHandle create_kernel_handle_ref(CUkernel kernel) {
    if (auto h = kernel_registry.lookup(kernel)) {
        return h;
    }
    auto box = std::make_shared<const KernelBox>(KernelBox{kernel, {}});
    return KernelHandle(box, &box->resource);
}

// Library dependency recorded for this kernel; empty when no library was
// attached (pure refs) or the handle itself is empty.
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
    return h ? get_box(h)->h_library : LibraryHandle{};
}

// ============================================================================
// Graphics Resource Handles
// ============================================================================
Expand Down
35 changes: 28 additions & 7 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,12 @@ StreamHandle get_per_thread_stream();

// Create an owning event handle by calling cuEventCreate.
// The event structurally depends on the provided context handle.
// Metadata fields are stored in the EventBox for later retrieval.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags);
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
bool timing_disabled, bool busy_waited,
bool ipc_enabled, int device_id);

// Create an owning event handle without context dependency.
// Use for temporary events that are created and destroyed in the same scope.
Expand All @@ -214,7 +217,21 @@ EventHandle create_event_handle_noctx(unsigned int flags);
// The originating process owns the event and its context.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle);
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
bool busy_waited);

// Create a non-owning event handle (references existing event).
// Use for events that are managed by the CUDA graph or another owner.
// The event will NOT be destroyed when the handle is released.
// Metadata defaults to unknown (timing_disabled=true, device_id=-1).
EventHandle create_event_handle_ref(CUevent event);

// Event metadata accessors (read from EventBox via pointer arithmetic)
bool get_event_timing_disabled(const EventHandle& h) noexcept;
bool get_event_busy_waited(const EventHandle& h) noexcept;
bool get_event_ipc_enabled(const EventHandle& h) noexcept;
int get_event_device_id(const EventHandle& h) noexcept;
ContextHandle get_event_context(const EventHandle& h) noexcept;

// ============================================================================
// Memory pool handle functions
Expand Down Expand Up @@ -345,9 +362,14 @@ LibraryHandle create_library_handle_ref(CUlibrary library);
// Returns empty handle on error (caller must check).
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name);

// Create a non-owning kernel handle with library dependency.
// Use for borrowed kernels. The library handle keeps the library alive.
KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library);
// Create a kernel handle from a raw CUkernel.
// If the kernel is already managed (in the registry), returns the owning
// handle with library dependency. Otherwise returns a non-owning ref.
KernelHandle create_kernel_handle_ref(CUkernel kernel);

// Get the library handle associated with a kernel (from KernelBox).
// Returns empty handle if the kernel has no library dependency.
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept;

// ============================================================================
// Graphics resource handle functions
Expand Down Expand Up @@ -516,8 +538,6 @@ inline std::intptr_t as_intptr(const CuLinkHandle& h) noexcept {
}

// as_py() - convert handle to Python wrapper object (returns new reference)
namespace detail {

#if PY_VERSION_HEX < 0x030D0000
extern "C" int _Py_IsFinalizing(void);
#endif
Expand All @@ -530,6 +550,7 @@ inline bool py_is_finalizing() noexcept {
#endif
}

namespace detail {
// n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
inline PyObject* make_py(const char* module_name, const char* class_name, std::intptr_t value) noexcept {
if (py_is_finalizing()) {
Expand Down
8 changes: 3 additions & 5 deletions cuda_core/cuda/core/_event.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,13 @@ cdef class Event:

cdef:
EventHandle _h_event
ContextHandle _h_context
bint _timing_disabled
bint _busy_waited
bint _ipc_enabled
object _ipc_descriptor
int _device_id
object __weakref__

@staticmethod
cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free)

@staticmethod
cdef Event _from_handle(EventHandle h_event)

cpdef close(self)
Loading
Loading