Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 136 additions & 40 deletions cuda_core/cuda/core/_cpp/resource_handles.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ NvJitLinkDestroyFn p_nvJitLinkDestroy = nullptr;

namespace {

using cuda_core::detail::py_is_finalizing;

// Helper to release the GIL while calling into the CUDA driver.
// This guard is *conditional*: if the caller already dropped the GIL,
// we avoid calling PyEval_SaveThread (which requires holding the GIL).
Expand Down Expand Up @@ -148,6 +146,51 @@ class GILAcquireGuard {

} // namespace

// ============================================================================
// Handle reverse-lookup registry
//
// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
// shared_ptr so that _ref constructors can recover full metadata.
// Uses weak_ptr to avoid preventing destruction.
// ============================================================================

// ============================================================================
// Handle reverse-lookup registry
//
// Maps raw CUDA handles (CUevent, CUkernel, etc.) back to their owning
// shared_ptr so that _ref constructors can recover full metadata.
// Entries are weak_ptr so the registry never extends a handle's lifetime.
// All operations are guarded by the internal mutex.
// ============================================================================

template<typename Key, typename Handle, typename Hash = std::hash<Key>>
class HandleRegistry {
public:
    // `Handle` is expected to be a shared_ptr-like type; element_type is
    // the boxed resource it points at.
    using MapType = std::unordered_map<Key, std::weak_ptr<typename Handle::element_type>, Hash>;

    // Record (or overwrite) the weak mapping key -> handle.
    // NOTE(review): explicit template arguments are kept on lock_guard —
    // a reviewer noted CTAD was not available on the MSVC configuration used.
    void register_handle(const Key& key, const Handle& h) {
        std::lock_guard<std::mutex> lock(mutex_);
        map_[key] = h;
    }

    // Remove the entry for `key`, but only if it has expired. This avoids a
    // race where another thread re-registered a fresh handle under the same
    // key before this (deleter-context) cleanup acquired the lock.
    // noexcept: called from shared_ptr deleters, so failures are swallowed.
    void unregister_handle(const Key& key) noexcept {
        try {
            std::lock_guard<std::mutex> lock(mutex_);
            auto it = map_.find(key);
            if (it != map_.end() && it->second.expired()) {
                map_.erase(it);
            }
        } catch (...) {}
    }

    // Return the live handle registered under `key`, or an empty handle.
    // Expired entries are erased opportunistically on lookup.
    Handle lookup(const Key& key) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = map_.find(key);
        if (it != map_.end()) {
            if (auto h = it->second.lock()) {
                return h;
            }
            map_.erase(it);
        }
        return {};
    }

private:
    std::mutex mutex_;
    MapType map_;
};

// ============================================================================
// Thread-local error handling
// ============================================================================
Expand Down Expand Up @@ -306,47 +349,98 @@ StreamHandle get_per_thread_stream() {
namespace {
// Storage backing an EventHandle. get_box() recovers the enclosing box
// from a pointer to `resource` using offsetof, so the handle itself only
// exposes the raw CUevent while the box carries the metadata.
struct EventBox {
    CUevent resource;        // raw CUDA event handle
    bool timing_disabled;    // creation-time metadata (see create_event_handle)
    bool busy_waited;        // creation-time metadata
    bool ipc_enabled;        // creation-time metadata
    int device_id;           // device id recorded at creation; -1 if unknown
    ContextHandle h_context; // keeps the owning context alive; may be empty
};
} // namespace

EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags) {
// Recover the EventBox that owns the CUevent an EventHandle points at.
// The handle aliases &EventBox::resource, so stepping back by the member
// offset yields the enclosing box.
static const EventBox* get_box(const EventHandle& h) {
    auto bytes = reinterpret_cast<const char*>(h.get());
    return reinterpret_cast<const EventBox*>(bytes - offsetof(EventBox, resource));
}

// Event metadata accessors: read fields from the EventBox backing the
// handle. Each returns a conservative default for an empty handle.

// Empty handles report timing as disabled.
bool get_event_timing_disabled(const EventHandle& h) noexcept {
    return h ? get_box(h)->timing_disabled : true;
}

bool get_event_busy_waited(const EventHandle& h) noexcept {
    return h ? get_box(h)->busy_waited : false;
}

bool get_event_ipc_enabled(const EventHandle& h) noexcept {
    return h ? get_box(h)->ipc_enabled : false;
}

// -1 means "unknown device".
int get_event_device_id(const EventHandle& h) noexcept {
    return h ? get_box(h)->device_id : -1;
}

// May return an empty handle for events created without a context dependency.
ContextHandle get_event_context(const EventHandle& h) noexcept {
    return h ? get_box(h)->h_context : ContextHandle{};
}

// Reverse map: raw CUevent -> weak ref to its owning EventHandle, so
// create_event_handle_ref can recover full metadata for managed events.
static HandleRegistry<CUevent, EventHandle> event_registry;

// Create an owning event handle via cuEventCreate.
// The metadata flags and device id are stored in the EventBox for later
// retrieval through the get_event_* accessors; h_ctx is captured by the
// deleter so the context outlives the event.
// Returns an empty handle on driver error (thread-local `err` holds it).
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
                                bool timing_disabled, bool busy_waited,
                                bool ipc_enabled, int device_id) {
    GILReleaseGuard gil;  // driver call may block; do not hold the GIL
    CUevent event;
    if (CUDA_SUCCESS != (err = p_cuEventCreate(&event, flags))) {
        return {};
    }

    auto box = std::shared_ptr<const EventBox>(
        new EventBox{event, timing_disabled, busy_waited, ipc_enabled, device_id, h_ctx},
        [h_ctx](const EventBox* b) {
            // Drop the registry entry before destroying the event so no
            // lookup can hand out the dying handle's raw CUevent.
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    // Aliasing constructor: the handle exposes &resource while the box
    // controls the lifetime.
    EventHandle h(box, &box->resource);
    event_registry.register_handle(event, h);
    return h;
}

// Create an owning event handle with no context dependency.
// Metadata defaults: timing disabled, not busy-waited, no IPC, unknown device.
EventHandle create_event_handle_noctx(unsigned int flags) {
    return create_event_handle(ContextHandle{}, flags, true, false, false, -1);
}

// Wrap a raw CUevent without taking ownership.
// If the event is already managed (present in the registry), return the
// owning handle, which carries full metadata. Otherwise build a ref box
// with unknown metadata; its default deleter frees only the box —
// cuEventDestroy is never called for refs.
EventHandle create_event_handle_ref(CUevent event) {
    if (auto h = event_registry.lookup(event)) {
        return h;
    }
    auto box = std::make_shared<const EventBox>(EventBox{event, true, false, false, -1, {}});
    return EventHandle(box, &box->resource);
}

// Import an event shared from another process via cuIpcOpenEventHandle.
// The box records timing_disabled=true and ipc_enabled=true; the device is
// unknown (-1) and no context handle is kept — the originating process owns
// the event's context.
// Returns an empty handle on driver error (thread-local `err` holds it).
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
                                    bool busy_waited) {
    GILReleaseGuard gil;
    CUevent event;
    if (CUDA_SUCCESS != (err = p_cuIpcOpenEventHandle(&event, ipc_handle))) {
        return {};
    }

    auto box = std::shared_ptr<const EventBox>(
        new EventBox{event, true, busy_waited, true, -1, {}},
        [](const EventBox* b) {
            // Unregister first so lookups cannot resurrect a dying event.
            event_registry.unregister_handle(b->resource);
            GILReleaseGuard gil;
            p_cuEventDestroy(b->resource);
            delete b;
        }
    );
    EventHandle h(box, &box->resource);
    event_registry.register_handle(event, h);
    return h;
}

// ============================================================================
Expand Down Expand Up @@ -653,61 +747,43 @@ struct ExportDataKeyHash {

}

static std::mutex ipc_ptr_cache_mutex;
static std::unordered_map<ExportDataKey, std::weak_ptr<DevicePtrBox>, ExportDataKeyHash> ipc_ptr_cache;
static HandleRegistry<ExportDataKey, DevicePtrHandle, ExportDataKeyHash> ipc_ptr_cache;
static std::mutex ipc_import_mutex;

DevicePtrHandle deviceptr_import_ipc(const MemoryPoolHandle& h_pool, const void* export_data, const StreamHandle& h_stream) {
auto data = const_cast<CUmemPoolPtrExportData*>(
reinterpret_cast<const CUmemPoolPtrExportData*>(export_data));

if (use_ipc_ptr_cache()) {
// Check cache before calling cuMemPoolImportPointer
ExportDataKey key;
std::memcpy(&key.data, data, sizeof(key.data));

std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
std::lock_guard<std::mutex> lock(ipc_import_mutex);

auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end()) {
if (auto box = it->second.lock()) {
// Cache hit - return existing handle
return DevicePtrHandle(box, &box->resource);
}
ipc_ptr_cache.erase(it); // Expired entry
if (auto h = ipc_ptr_cache.lookup(key)) {
return h;
}

// Cache miss - import the pointer
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
return {};
}

// Create new handle with cache-clearing deleter
auto box = std::shared_ptr<DevicePtrBox>(
new DevicePtrBox{ptr, h_stream},
[h_pool, key](DevicePtrBox* b) {
ipc_ptr_cache.unregister_handle(key);
GILReleaseGuard gil;
try {
std::lock_guard<std::mutex> lock(ipc_ptr_cache_mutex);
// Only erase if expired - avoids race where another thread
// replaced the entry with a new import before we acquired the lock.
auto it = ipc_ptr_cache.find(key);
if (it != ipc_ptr_cache.end() && it->second.expired()) {
ipc_ptr_cache.erase(it);
}
} catch (...) {
// Cache cleanup is best-effort - swallow exceptions in destructor context
}
p_cuMemFreeAsync(b->resource, as_cu(b->h_stream));
delete b;
}
);
ipc_ptr_cache[key] = box;
return DevicePtrHandle(box, &box->resource);
DevicePtrHandle h(box, &box->resource);
ipc_ptr_cache.register_handle(key, h);
return h;

} else {
// No caching - simple handle creation
GILReleaseGuard gil;
CUdeviceptr ptr;
if (CUDA_SUCCESS != (err = p_cuMemPoolImportPointer(&ptr, *h_pool, data))) {
Expand Down Expand Up @@ -786,25 +862,45 @@ LibraryHandle create_library_handle_ref(CUlibrary library) {
namespace {
// Storage backing a KernelHandle; get_box() recovers the box from a
// pointer to `resource` using offsetof.
struct KernelBox {
    CUkernel resource;       // raw CUDA kernel handle
    LibraryHandle h_library; // keeps the owning library alive; may be empty
};
} // namespace

// Recover the KernelBox backing a KernelHandle (the handle aliases
// &KernelBox::resource, so subtract the member offset).
static const KernelBox* get_box(const KernelHandle& h) {
    auto bytes = reinterpret_cast<const char*>(h.get());
    return reinterpret_cast<const KernelBox*>(bytes - offsetof(KernelBox, resource));
}

// Reverse map: raw CUkernel -> weak ref to its owning KernelHandle.
static HandleRegistry<CUkernel, KernelHandle> kernel_registry;

// Look up `name` in the library and wrap the result in a KernelHandle.
// The box stores h_library so the library outlives every kernel handle,
// and the handle is registered so create_kernel_handle_ref can later
// recover the library dependency from the raw CUkernel.
// Returns an empty handle on driver error (thread-local `err` holds it).
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name) {
    GILReleaseGuard gil;
    CUkernel kernel;
    if (CUDA_SUCCESS != (err = p_cuLibraryGetKernel(&kernel, *h_library, name))) {
        return {};
    }

    auto box = std::make_shared<const KernelBox>(KernelBox{kernel, h_library});
    KernelHandle h(box, &box->resource);
    kernel_registry.register_handle(kernel, h);
    return h;
}

// Wrap a raw CUkernel. If the kernel is already managed (present in the
// registry), return the registered handle, which carries the library
// dependency. Otherwise return a non-owning ref with no library attached;
// its default deleter frees only the box.
KernelHandle create_kernel_handle_ref(CUkernel kernel) {
    if (auto h = kernel_registry.lookup(kernel)) {
        return h;
    }
    auto box = std::make_shared<const KernelBox>(KernelBox{kernel, {}});
    return KernelHandle(box, &box->resource);
}

// Library dependency recorded for this kernel; empty when no library was
// attached (pure refs) or the handle itself is empty.
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept {
    return h ? get_box(h)->h_library : LibraryHandle{};
}

// ============================================================================
// Graphics Resource Handles
// ============================================================================
Expand Down
35 changes: 28 additions & 7 deletions cuda_core/cuda/core/_cpp/resource_handles.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,12 @@ StreamHandle get_per_thread_stream();

// Create an owning event handle by calling cuEventCreate.
// The event structurally depends on the provided context handle.
// Metadata fields are stored in the EventBox for later retrieval.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags);
EventHandle create_event_handle(const ContextHandle& h_ctx, unsigned int flags,
bool timing_disabled, bool busy_waited,
bool ipc_enabled, int device_id);

// Create an owning event handle without context dependency.
// Use for temporary events that are created and destroyed in the same scope.
Expand All @@ -214,7 +217,21 @@ EventHandle create_event_handle_noctx(unsigned int flags);
// The originating process owns the event and its context.
// When the last reference is released, cuEventDestroy is called automatically.
// Returns empty handle on error (caller must check).
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle);
EventHandle create_event_handle_ipc(const CUipcEventHandle& ipc_handle,
bool busy_waited);

// Create a non-owning event handle (references existing event).
// Use for events that are managed by the CUDA graph or another owner.
// The event will NOT be destroyed when the handle is released.
// Metadata defaults to unknown (timing_disabled=true, device_id=-1).
EventHandle create_event_handle_ref(CUevent event);

// Event metadata accessors (read from EventBox via pointer arithmetic)
bool get_event_timing_disabled(const EventHandle& h) noexcept;
bool get_event_busy_waited(const EventHandle& h) noexcept;
bool get_event_ipc_enabled(const EventHandle& h) noexcept;
int get_event_device_id(const EventHandle& h) noexcept;
ContextHandle get_event_context(const EventHandle& h) noexcept;

// ============================================================================
// Memory pool handle functions
Expand Down Expand Up @@ -345,9 +362,14 @@ LibraryHandle create_library_handle_ref(CUlibrary library);
// Returns empty handle on error (caller must check).
KernelHandle create_kernel_handle(const LibraryHandle& h_library, const char* name);

// Create a non-owning kernel handle with library dependency.
// Use for borrowed kernels. The library handle keeps the library alive.
KernelHandle create_kernel_handle_ref(CUkernel kernel, const LibraryHandle& h_library);
// Create a kernel handle from a raw CUkernel.
// If the kernel is already managed (in the registry), returns the owning
// handle with library dependency. Otherwise returns a non-owning ref.
KernelHandle create_kernel_handle_ref(CUkernel kernel);

// Get the library handle associated with a kernel (from KernelBox).
// Returns empty handle if the kernel has no library dependency.
LibraryHandle get_kernel_library(const KernelHandle& h) noexcept;

// ============================================================================
// Graphics resource handle functions
Expand Down Expand Up @@ -516,8 +538,6 @@ inline std::intptr_t as_intptr(const CuLinkHandle& h) noexcept {
}

// as_py() - convert handle to Python wrapper object (returns new reference)
namespace detail {

#if PY_VERSION_HEX < 0x030D0000
extern "C" int _Py_IsFinalizing(void);
#endif
Expand All @@ -530,6 +550,7 @@ inline bool py_is_finalizing() noexcept {
#endif
}

namespace detail {
// n.b. class lookup is not cached to avoid deadlock hazard, see DESIGN.md
inline PyObject* make_py(const char* module_name, const char* class_name, std::intptr_t value) noexcept {
if (py_is_finalizing()) {
Expand Down
8 changes: 3 additions & 5 deletions cuda_core/cuda/core/_event.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,13 @@ cdef class Event:

cdef:
EventHandle _h_event
ContextHandle _h_context
bint _timing_disabled
bint _busy_waited
bint _ipc_enabled
object _ipc_descriptor
int _device_id
object __weakref__

@staticmethod
cdef Event _init(type cls, int device_id, ContextHandle h_context, options, bint is_free)

@staticmethod
cdef Event _from_handle(EventHandle h_event)

cpdef close(self)
Loading
Loading