From 0cd109a474a09947b135b3e9e04979d4af686b97 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:44:01 +0700 Subject: [PATCH 001/149] Add more cuda function to load --- include/nbl/video/CCUDAHandler.h | 51 +++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..5f165b207a 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -18,7 +18,7 @@ namespace nbl::video class CCUDAHandler : public core::IReferenceCounted { - public: + public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); inline bool defaultHandleResult(CUresult result) { @@ -34,7 +34,7 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; @@ -119,6 +119,24 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -157,13 +175,25 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t 
bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -228,16 +258,8 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + ~CCUDAHandler() = default; // @@ -260,6 +282,7 @@ class CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // + core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; From bbe25abc6c93430b2ad4ad350fb8d37dd0bc3663 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:44:48 +0700 Subject: [PATCH 002/149] Add _NBL_COMPILE_WITH_CUDA_ compile definition on CMakeLists.txt --- src/nbl/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt 
index 9c994bfa41..a680a19eab 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -422,6 +422,10 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() +if(NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) +endif() + set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) From d74349e590492206df24efa742ac76d8977f839a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:47:23 +0700 Subject: [PATCH 003/149] Move CCudaHandler constructor to cpp and query device info and attributes --- src/nbl/video/CCUDAHandler.cpp | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 7fb60d79bf..c111f3c73e 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -11,6 +11,47 @@ namespace nbl::video { +CCUDAHandler::CCUDAHandler( + CUDA&& _cuda, + NVRTC&& _nvrtc, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_cuda(std::move(_cuda)) + , m_nvrtc(std::move(_nvrtc)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{ + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int device_i = 0; device_i < deviceCount; device_i++) + { + CUdevice handle = -1; + if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + continue; + + m_availableDevices.emplace_back(handle, uuid); + + int* attributes = 
m_availableDevices.back().attributes; + for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) + m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + + } +} bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) From 38ed6dbd4affb940430f2367db4ad287ef8fe1e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:47:58 +0700 Subject: [PATCH 004/149] Add missing CFileView.h header in CCudaHandler.cpp --- src/nbl/video/CCUDAHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index c111f3c73e..c8f8a328be 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDAHandler.h" +#include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "jitify/jitify.hpp" From 95338cd941a213381580bd01ddc64f2ff47e698f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:48:14 +0700 Subject: [PATCH 005/149] Fix indentation of CCudaHandler.cpp --- src/nbl/video/CCUDAHandler.cpp | 50 ++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index c8f8a328be..1f723ba641 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -53,6 +53,7 @@ CCUDAHandler::CCUDAHandler( } } + bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) @@ -452,7 +453,40 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? 
- const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; + // Version List: https://developer.nvidia.com/cuda-toolkit-archive + const char* nvrtc64_versions[] = { + "nvrtc64_131", + "nvrtc64_130", + "nvrtc64_129", + "nvrtc64_128", + "nvrtc64_126", + "nvrtc64_125", + "nvrtc64_124", + "nvrtc64_123", + "nvrtc64_122", + "nvrtc64_121", + "nvrtc64_120", + "nvrtc64_118", + "nvrtc64_117", + "nvrtc64_116", + "nvrtc64_115", + "nvrtc64_114", + "nvrtc64_113", + "nvrtc64_112", + "nvrtc64_111", + "nvrtc64_110", + "nvrtc64_102", + "nvrtc64_101", + "nvrtc64_100", + "nvrtc64_92", + "nvrtc64_91", + "nvrtc64_90", + "nvrtc64_80", + "nvrtc64_75", + "nvrtc64_70", + nullptr + }; + const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; for (auto verpath=nvrtc64_versions; *verpath; verpath++) { @@ -567,11 +601,11 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) return nullptr; - for (int ordinal=0; ordinal CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i CCUDAHandler::createDevice(core::smart_refct continue; auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + return core::smart_refctd_ptr(device,core::dont_grab); + } + } return nullptr; } From 
3e9dfd2e0c5a03171df9ab542c9499200814b225 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:22:04 +0700 Subject: [PATCH 006/149] Add NBL_API2 to CCudaHandler --- include/nbl/video/CCUDAHandler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 5f165b207a..ef040f5536 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,7 +16,7 @@ namespace nbl::video { -class CCUDAHandler : public core::IReferenceCounted +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); From 1ae7747fecb3e3d080dbc90f44f0a1e86d977efe Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:22:40 +0700 Subject: [PATCH 007/149] Fix fetching deviceUUID logic --- src/nbl/video/CCUDAHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 1f723ba641..9dbf92e770 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -614,7 +614,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i Date: Sat, 28 Feb 2026 15:22:58 +0700 Subject: [PATCH 008/149] Fix usage of CFileView --- src/nbl/video/CCUDAHandler.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9dbf92e770..75e372b705 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -544,8 +544,10 @@ core::smart_refctd_ptr 
CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, + // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? + std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } From 5018be7b8821ec3b09a381d2ca32d189a9d8f0df Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:23:13 +0700 Subject: [PATCH 009/149] Fix use after move of ptx cpuBuffer --- src/nbl/video/CCUDAHandler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 75e372b705..d0f8043b17 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -591,8 +591,9 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({ _size }); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + auto ptx = asset::ICPUBuffer::create({_size}); + auto ptxPtr = static_cast(ptx->getPointer()); + return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) From 5251b4def078cf20e2080bf89c9d55ac9c3781e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:25:19 +0700 Subject: [PATCH 010/149] Improve cpuBuffer initialization using params instead of aggregate initializer --- src/nbl/video/CCUDAHandler.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index d0f8043b17..4dbf0cb488 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ 
b/src/nbl/video/CCUDAHandler.cpp @@ -591,7 +591,9 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({_size}); + asset::ICPUBuffer::SCreationParams ptxParams = {}; + ptxParams.size = _size; + auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } From d655b1977381c92abf0a0b496e818d63ae3ea009 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 16:48:58 +0700 Subject: [PATCH 011/149] Fix indentation of CCudaHandler.cpp into tabs --- src/nbl/video/CCUDAHandler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 4dbf0cb488..aac9dc67cc 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -606,11 +606,11 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) return nullptr; - for (int ordinal=0; ordinal CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i CCUDAHandler::createDevice(core::smart_refct continue; auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } + return 
core::smart_refctd_ptr(device,core::dont_grab); } + } return nullptr; } From 454710b3aa6cbf8c303cfbbf5ff435218a38db42 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 16:54:49 +0700 Subject: [PATCH 012/149] Iterate m_availableDevices when creatingDevice --- src/nbl/video/CCUDAHandler.cpp | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index aac9dc67cc..add5e3db92 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -606,28 +606,13 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinalgetProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { case 3: From 4645bc4214422b4bb5678d0010d55d9d0792033d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 18:07:36 +0700 Subject: [PATCH 013/149] Implement context creation in CCUDADevice --- include/nbl/video/CCUDADevice.h | 6 +++++- src/nbl/video/CCUDADevice.cpp | 21 +++++++++++++++++++-- src/nbl/video/CCUDAHandler.cpp | 3 +-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h 
b/include/nbl/video/CCUDADevice.h index 1120224fdb..b204b98b23 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -182,13 +182,17 @@ class CCUDADevice : public core::IReferenceCounted protected: friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); + CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); ~CCUDADevice() = default; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_vulkanDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + + core::smart_refctd_ptr m_handler; + CUdevice m_handle; + CUcontext m_context = nullptr; }; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..bf96c6e78d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -7,13 +7,30 @@ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& _vulkanConnection, + IPhysicalDevice* const _vulkanDevice, + const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, + CUdevice _device, + core::smart_refctd_ptr&& _handler) : + m_defaultCompileOptions(), + m_vulkanConnection(std::move(_vulkanConnection)), + m_vulkanDevice(_vulkanDevice), + m_virtualArchitecture(_virtualArchitecture), + m_handler(std::move(_handler)), + m_handle(_device) { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); 
m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); + + auto& cu = m_handler->getCUDAFunctionTable(); + + CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + assert(CUDA_SUCCESS == re); + re = cu.pcuCtxSetCurrent(m_context); + assert(CUDA_SUCCESS == re); } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index add5e3db92..0eba770c89 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -690,8 +690,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); + return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); } } return nullptr; From 3172ae76cf5da9a49bcb53bdd34b44830c6c125f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 12 Mar 2026 19:46:34 +0700 Subject: [PATCH 014/149] Implement physical device getExternalMemoryProperties --- include/nbl/video/IDeviceMemoryAllocation.h | 13 +++++++ include/nbl/video/IDeviceMemoryBacked.h | 2 ++ include/nbl/video/IPhysicalDevice.h | 40 +++++++++++++++++++++ src/nbl/video/CVulkanPhysicalDevice.cpp | 19 ++++++++++ src/nbl/video/CVulkanPhysicalDevice.h | 2 ++ 5 files changed, 76 insertions(+) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 00e55a66e3..46f77975fb 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -68,6 +68,19 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted EMHF_MULTI_INSTANCE_BIT = 0x00000002, }; + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + }; + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index b0c0ce05ed..04693456d7 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -39,6 +39,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify multiple queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 4222a22153..c1a703c993 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -639,6 +639,43 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return std::span(m_initData.qfamProperties->data(),m_initData.qfamProperties->data()+m_initData.qfamProperties->size()); } + enum class E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t + { + EEMF_NONE = 0x0, + EEMF_DEDICATED_ONLY_BIT = 0x1, + EEMF_EXPORTABLE_BIT = 0x2, + EEMF_IMPORTABLE_BIT = 0x4, + }; + + struct SExternalMemoryProperties + { + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes : 9; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes : 9; + // TODO(kevin): This should actually be core::bitflag to be semantically correct. What should we do? 
Should we use bool for each flag instead of enum? + E_EXTERNAL_MEMORY_FEATURE_FLAGS features : 3; + bool operator == (SExternalMemoryProperties const& rhs) const = default; + }; + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usages, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usages &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + + // TODO(kevinyu): Should we cache the properties like Atil does? If yes, this needs a mutex and a mutable specifier, so the class is no longer that simple. + // { + // std::shared_lock lock(m_externalBufferPropertiesMutex); + // auto it = m_externalBufferProperties.find({ usage, handleType }); + // if (it != m_externalBufferProperties.end()) + // return it->second; + // } + // + // std::unique_lock lock(m_externalBufferPropertiesMutex); + // return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + return getExternalMemoryProperties_impl(usages, handleType); + } + struct SBufferFormatPromotionRequest { asset::E_FORMAT originalFormat = asset::EF_UNKNOWN; SFormatBufferUsages::SUsage usages = SFormatBufferUsages::SUsage(); @@ -683,6 +720,9 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable }; inline IPhysicalDevice(SInitData&& _initData) : m_initData(std::move(_initData)) {} + // External memory properties query + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + // ILogicalDevice creation bool validateLogicalDeviceCreation(const ILogicalDevice::SCreationParams& params) const; virtual core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) = 0; diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 
da86d7c9d9..54e8543668 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1371,6 +1371,25 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart #undef RETURN_NULL_PHYSICAL_DEVICE +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usages.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) { // We might alter it to account for dependancies. 
diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..5cb2556d6e 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,8 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: From f9b8b4fe51848661de1e89370ad838526d2114af Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 12 Mar 2026 21:15:17 +0700 Subject: [PATCH 015/149] Dedicated buffer and image --- include/nbl/asset/IBuffer.h | 2 ++ include/nbl/video/CVulkanDeviceMemoryBacked.h | 6 +++--- include/nbl/video/ILogicalDevice.h | 4 ++-- src/nbl/video/CVulkanBuffer.h | 2 +- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 6 +++--- src/nbl/video/CVulkanLogicalDevice.cpp | 8 ++++---- src/nbl/video/CVulkanLogicalDevice.h | 4 ++-- 7 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index 3a7cbb5983..92ffd3eb4d 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -42,6 +42,8 @@ class IBuffer : public IDescriptor, public core::IBuffer //! synthetic Nabla inventions // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, + + EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if any more synthetic flags are added*/ }; //! 
diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index e6d17ddf3e..696d69058f 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface protected: // special constructor for when memory requirements are known up-front (so far only swapchains and internal forwarding here) CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle); - CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) : - CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {} + CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) : + CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {} private: - static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle); + static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle); core::smart_refctd_ptr m_memory = nullptr; size_t m_offset = 0u; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 756b417c79..b12f4be333 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1129,9 +1129,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const 
uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..944d7db205 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev, std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) const override; diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 90b2993cb3..39c0efae19 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 
@@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR @@ -319,7 +319,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this, std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -338,7 +338,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { const bool hasStencil = asset::isDepthOrStencilFormat(params.format) && 
!asset::isDepthOnlyFormat(params.format); VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; @@ -377,7 +377,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index e77386cb34..8f43a6783a 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -110,9 +110,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline core::smart_refctd_ptr 
createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override From a2357e2d8ec16bef81c9b4763964ce3db27c9bb3 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Mar 2026 10:37:50 +0700 Subject: [PATCH 016/149] External Memory Feature flags should not be enum class --- include/nbl/video/IPhysicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index c1a703c993..2ae58f22e3 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -639,7 +639,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return std::span(m_initData.qfamProperties->data(),m_initData.qfamProperties->data()+m_initData.qfamProperties->size()); } - enum class E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t + enum E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t { EEMF_NONE = 0x0, EEMF_DEDICATED_ONLY_BIT = 0x1, From 0d9c3d81f2b6681c28a7a00e54eb823260f7f4c1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Mar 2026 11:56:17 +0700 Subject: [PATCH 017/149] External Vulkan Buffer Creation --- include/nbl/video/ILogicalDevice.h | 16 +------------ src/nbl/video/CVulkanLogicalDevice.cpp | 8 ++++++- src/nbl/video/ILogicalDevice.cpp | 32 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index b12f4be333..1c7393bb57 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -331,21 +331,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! 
Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit (%u)!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Buffer, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); // not `inline`: the body now lives out-of-line in ILogicalDevice.cpp // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 9bf85a5b1e..32e9ac2022 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -301,9 +301,15 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + vk_createInfo.pNext =
creationParams.externalHandleTypes.value ? &externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 7958efa5c0..01e49a26d2 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -298,6 +298,38 @@ bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asse return true; } +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit (%u)!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + for (auto idx = hlsl::findLSB(static_cast(requestedTypes.value)); idx != -1; idx = hlsl::findLSB(static_cast(requestedTypes.value))) // idx is the bit index itself (not the `!= -1` result); re-scanned after each bit is cleared below + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} IQueue::RESULT ILogicalDevice::waitIdle() { From 89f5ae54224aa619adbafe275f68f3557f4b87df Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Mar 2026 14:44:19 +0700 Subject: [PATCH 018/149] Temporary
enable compile with cuda flag --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ba3410075..e0068b002a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,7 +70,8 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +# TODO(kevinyu): Turn off this flag after I finish developing the PR. +option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" ON) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) From 152830f613e8ce1cf7114babc450aa8544f8fb8f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Mar 2026 12:18:59 +0700 Subject: [PATCH 019/149] Update examples_tests submodule to vk_cuda interop demo branch --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8f045a1c27..b8abd200a1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8f045a1c27a198f8542456378f865032765378b8 +Subproject commit b8abd200a1a83ce4592f7ad3290d07ae02b4f538 From ea3b49b188504be1b13dad19a8751d762beb2aed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:35:26 +0700 Subject: [PATCH 020/149] External memory allocation --- include/nbl/video/EApiType.h | 8 ++ include/nbl/video/IDeviceMemoryAllocation.h | 66 +++++++---- include/nbl/video/IDeviceMemoryAllocator.h | 46 ++++++-- src/nbl/video/CVulkanLogicalDevice.cpp | 122 ++++++++++++++++++-- src/nbl/video/CVulkanMemoryAllocation.cpp | 9 +- src/nbl/video/CVulkanMemoryAllocation.h | 7 +- src/nbl/video/IDeviceMemoryAllocation.cpp | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 13 +-- 8 files changed, 214 insertions(+), 59 deletions(-) diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index e670dc90d8..3e86c8d040 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,14 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; +using 
ExternalHandleType = +#ifdef _WIN32 +void* +#else +int +#endif +; + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 46f77975fb..8de6bd4fa8 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,6 +24,8 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { + friend class IDeviceMemoryAllocator; + friend class ILogicalDevice; public: //! Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) @@ -88,26 +90,26 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! - inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! 
Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ // TODO: Visible is a misnomer, collides with Vulkan memory model nomenclature where visibility only concerns reads, where as this is both read and write (visibility and availability) inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! 
@@ -123,9 +125,9 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return nullptr; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return nullptr; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -166,29 +168,53 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + ExternalHandleType externalHandle = 0; + }; + + struct SCreationParams: SInfo + { + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + const bool dedicated = false; + }; + + inline const SCreationParams& getCreationParams() const { return m_params; } + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) + { + m_postDestroyCleanup = std::move(cleanup); + } + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_params(std::move(params)) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; + const ILogicalDevice* m_originDevice = nullptr; + SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; MemoryRange m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; + std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) } // end namespace nbl::video -#endif - - +#endif \ No newline at end of file diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index e85eec12a0..9201d3f849 100644 --- 
a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -15,11 +15,9 @@ class NBL_API2 IDeviceMemoryAllocator // right now we only support this interface handing out memory for one device or group virtual ILogicalDevice* getDeviceForAllocations() const = 0; - struct SAllocateInfo + struct SAllocateInfo : IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; + size_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. }; @@ -45,8 +43,15 @@ class NBL_API2 IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + ExternalHandleType handle) : + m_allocateFlags(static_cast(allocateFlags.value)), + m_reqs(reqs), + m_handleType(handleType), + m_handle(handle) + {} static inline uint32_t end() {return 32u;} @@ -59,10 +64,12 @@ class NBL_API2 IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { SAllocateInfo ret; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.externalHandle = m_handle; return ret; } @@ -75,13 
+82,21 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; + ExternalHandleType m_handle; }; //! DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + ExternalHandleType handle + ) : + IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } @@ -106,15 +121,22 @@ class NBL_API2 IDeviceMemoryAllocator template inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + IDeviceMemoryBacked* dedication = nullptr, + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, + ExternalHandleType externalHandle = {}, + std::unique_ptr&& postDestroyCleanup = nullptr) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); auto allocation = allocate(allocateInfo); if (allocation.isValid()) + { + 
allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; + } } return {}; } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 32e9ac2022..6d5f896765 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -136,26 +136,85 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } +ExternalHandleType DupeHandle(uint64_t pid, ExternalHandleType handle) +{ +#ifdef _WIN32 + HANDLE re = 0; + + HANDLE cur = GetCurrentProcess(); + HANDLE src = pid ? OpenProcess(GENERIC_ALL, false, pid) : cur; + + if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return 0; + + CloseHandle(src); + return re; +#endif + return handle; +} IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if (info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + +#ifdef _WIN32 + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .handle = info.externalHandle + }; + + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = 
VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; +#else + VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = (int)info.externalHandle, + }; +#endif + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(info.externalHandleType), + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + if (info.externalHandleType) + { + if (info.externalHandle) //importing + { + auto duped = DupeHandle(0, info.externalHandle); + const_cast(info.externalHandle) = duped; + *pNext = &importInfo; + } + else // exporting + *pNext = &exportInfo; + while (*pNext) pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; // walk to the chain tail: on Win32 exportInfo.pNext already points at handleInfo and must not be overwritten + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -166,22 +225,65 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.externalHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR
+#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef _WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, // VkMemoryGetFdInfoKHR requires the FD sType, not the Win32 one +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. +#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { @@ -554,7 +656,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc vkDescSetLayoutBinding.stageFlags = getVkShaderStageFlagsFromShaderStage(binding.stageFlags); vkDescSetLayoutBinding.pImmutableSamplers = nullptr; - if ((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER or binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) and binding.immutableSamplers and binding.count) + if
((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER || binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) && binding.immutableSamplers && binding.count) { // If descriptorType is VK_DESCRIPTOR_TYPE_SAMPLER or VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, and descriptorCount is not 0 and pImmutableSamplers is not NULL: // pImmutableSamplers must be a valid pointer to an array of descriptorCount valid VkSampler handles. diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..8f50c29939 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,11 +4,10 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params +) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..22e32142c0 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,10 +15,9 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + 
SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } diff --git a/src/nbl/video/IDeviceMemoryAllocation.cpp b/src/nbl/video/IDeviceMemoryAllocation.cpp index 058f391de1..5f05e8d928 100644 --- a/src/nbl/video/IDeviceMemoryAllocation.cpp +++ b/src/nbl/video/IDeviceMemoryAllocation.cpp @@ -14,7 +14,7 @@ IDeviceMemoryAllocation::MemoryRange IDeviceMemoryAllocation::alignNonCoherentRa { const auto alignment = m_originDevice->getPhysicalDevice()->getLimits().nonCoherentAtomSize; range.offset = core::alignDown(range.offset,alignment); - range.length = core::min(core::alignUp(range.length,alignment),m_allocationSize); + range.length = core::min(core::alignUp(range.length,alignment),m_params.allocationSize); return range; } diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d7f2d7dbbc..4a5890c4b7 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2459,12 +2459,11 @@ class MetaDeviceMemoryAllocator final failures.reserve(binItemCount); // ... using allocate_flags_t = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS; - IDeviceMemoryAllocator::SAllocateInfo info = { - .size = 0xdeadbeefBADC0FFEull, // set later - .flags = reqBin.first.needsDeviceAddress ? allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT:allocate_flags_t::EMAF_NONE, - .memoryTypeIndex = memTypeIx, - .dedication = nullptr - }; + IDeviceMemoryAllocator::SAllocateInfo info; + info.allocationSize = 0xdeadbeefBADC0FFEull; // set later + info.allocateFlags = reqBin.first.needsDeviceAddress ? 
allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; + info.memoryTypeIndex = memTypeIx; + info.dedication = nullptr; // allocate in progression of combined allocations, while trying allocate as much as possible in a single allocation auto binItemsIt = binItems.begin(); for (auto firstOffsetIt=offsetsTmp.begin(); firstOffsetIt!=offsetsTmp.end(); ) @@ -2473,7 +2472,7 @@ class MetaDeviceMemoryAllocator final const size_t combinedCount = std::distance(firstOffsetIt,nextOffsetIt); const size_t lastIx = combinedCount-1; // if we take `combinedCount` starting at `firstItem` their allocation would need this size - info.size = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; + info.allocationSize = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; auto allocation = m_allocator->allocate(info); if (allocation.isValid()) { From 77b92ab7e66fbb659491248bdaa562ae59041fc2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:48:57 +0700 Subject: [PATCH 021/149] Fix indentation on CAssetConverter.cpp --- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4a5890c4b7..06bab99dd4 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2461,7 +2461,7 @@ class MetaDeviceMemoryAllocator final using allocate_flags_t = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS; IDeviceMemoryAllocator::SAllocateInfo info; info.allocationSize = 0xdeadbeefBADC0FFEull; // set later - info.allocateFlags = reqBin.first.needsDeviceAddress ? allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; + info.allocateFlags = reqBin.first.needsDeviceAddress ? 
allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; info.memoryTypeIndex = memTypeIx; info.dedication = nullptr; // allocate in progression of combined allocations, while trying allocate as much as possible in a single allocation From 68f740fa813fa2a712ed53d2d5c04462e0401a67 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:49:21 +0700 Subject: [PATCH 022/149] Update jitify submodule --- 3rdparty/jitify | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/jitify b/3rdparty/jitify index 0d6dbd8ccd..1a0ca0e837 160000 --- a/3rdparty/jitify +++ b/3rdparty/jitify @@ -1 +1 @@ -Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a +Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96 From 1c93a9157d5fa9d02415ebaa843749664e8ec209 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:52:07 +0700 Subject: [PATCH 023/149] External memory allocation cleanup --- src/nbl/video/CVulkanMemoryAllocation.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 8f50c29939..c817213700 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -11,6 +11,11 @@ CVulkanMemoryAllocation::CVulkanMemoryAllocation( CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_params.externalHandle) + { + bool re = CloseHandle(getCreationParams().externalHandle); + assert(re); + } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } From ae0e177f14e5b57e313a09b43edeb131aded2eaa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 18:01:09 +0700 Subject: [PATCH 024/149] Implement proper CCUDADevice destructor. 
--- include/nbl/video/CCUDADevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index b204b98b23..047680ba9c 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -183,7 +183,7 @@ class CCUDADevice : public core::IReferenceCounted protected: friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); - ~CCUDADevice() = default; + ~CCUDADevice(); std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index bf96c6e78d..79ba9c2c7a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -143,6 +143,11 @@ CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* } #endif +CCUDADevice::~CCUDADevice() +{ + m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); +} + } #endif // _NBL_COMPILE_WITH_CUDA_ From c83942a771a812ad5664854fc9462f5298e7f4e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Mar 2026 18:43:18 +0700 Subject: [PATCH 025/149] Implementation of Shared memory between vulkan and cuda --- include/nbl/video/CCUDADevice.h | 24 +++++- include/nbl/video/CCUDASharedMemory.h | 74 ++++++++++++++++++ src/nbl/CMakeLists.txt | 1 + src/nbl/video/CCUDADevice.cpp | 102 ++++++++++++++++++++++++- src/nbl/video/CCUDASharedMemory.cpp | 105 ++++++++++++++++++++++++++ 5 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 include/nbl/video/CCUDASharedMemory.h create mode 100644 src/nbl/video/CCUDASharedMemory.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 047680ba9c..62c3360d1e 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ 
-6,6 +6,7 @@ #include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/CCUDASharedMemory.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -26,7 +27,22 @@ class CCUDAHandler; class CCUDADevice : public core::IReferenceCounted { - public: + public: +#ifdef _WIN32 + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; + static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; +#else + static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; + static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif + + struct SCUDACleaner : video::ICleanup + { + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) + : resource(std::move(resource)) + {} + }; enum E_VIRTUAL_ARCHITECTURE { EVA_30, @@ -180,6 +196,11 @@ class CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif + CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { return m_handler.get(); } + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; + CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); protected: friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); @@ -193,6 +214,7 @@ class CCUDADevice : public core::IReferenceCounted 
core::smart_refctd_ptr m_handler; CUdevice m_handle; CUcontext m_context; + size_t m_allocationGranularity[4]; }; } diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h new file mode 100644 index 0000000000..f133dadd81 --- /dev/null +++ b/include/nbl/video/CCUDASharedMemory.h @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ + + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." +#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDAMemoryMapping: public core::IReferenceCounted +{ +}; + +class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUdeviceptr getDeviceptr() const { return m_params.ptr; } + + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + union + { + void* osHandle; + int fd; + }; + }; + + const SCreationParams& getCreationParams() const { return m_params; } + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + +protected: + + CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + : m_device(std::move(device)) + , m_params(std::move(params)) + {} + ~CCUDASharedMemory() override; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No 
newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index a680a19eab..09d3587e1d 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,6 +294,7 @@ set(NBL_VIDEO_SOURCES # CUDA video/CCUDAHandler.cpp video/CCUDADevice.cpp + video/CCUDASharedMemory.cpp ) set(NBL_SCENE_SOURCES diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 79ba9c2c7a..9dc3908b6b 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -18,7 +18,8 @@ CCUDADevice::CCUDADevice( m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture), m_handle(_device), - m_handler(std::move(_handler)) + m_handler(std::move(_handler)), + m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); @@ -31,6 +32,20 @@ CCUDADevice::CCUDADevice( assert(CUDA_SUCCESS == re); re = cu.pcuCtxSetCurrent(m_context); assert(CUDA_SUCCESS == re); + + for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + { + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = {.type = static_cast(i), .id = m_handle }, + .win32HandleMetaData = metaData, + }; + auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + assert(CUDA_SUCCESS == re); + } } @@ -143,6 +158,91 @@ CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* } #endif +size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +{ + return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; +} + +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +{ + auto& cu = 
m_handler->getCUDAFunctionTable(); + + CUdeviceptr ptr = 0; + if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + return err; + + if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) + { + cu.pcuMemAddressFree(ptr, size); + return err; + } + + CUmemAccessDesc accessDesc = { + .location = { .type = location, .id = m_handle }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; + + if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + cu.pcuMemUnmap(ptr, size); + cu.pcuMemAddressFree(ptr, size); + return err; + } + + *outPtr = ptr; + + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::createSharedMemory( + core::smart_refctd_ptr* outMem, + CCUDASharedMemory::SCreationParams&& inParams) +{ + if (!outMem) + return CUDA_ERROR_INVALID_VALUE; + + CCUDASharedMemory::SCachedCreationParams params = { inParams }; + + auto& cu = m_handler->getCUDAFunctionTable(); + + uint32_t metaData[16] = { 48 }; + + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = params.location, .id = m_handle }, + .win32HandleMetaData = metaData, + }; + + params.granularSize = roundToGranularity(params.location, params.size); + + CUmemGenericAllocationHandle mem; + if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) + return err; + + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.osHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) + { + cu.pcuMemRelease(mem); + return err; + } + + if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + { + CloseHandle(params.osHandle); + cu.pcuMemRelease(mem); + return err; + } + + if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + { + CloseHandle(params.osHandle); + return err; + } + + *outMem = core::smart_refctd_ptr(new 
CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + + return CUDA_SUCCESS; +} CCUDADevice::~CCUDADevice() { m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp new file mode 100644 index 0000000000..93ab6f4c48 --- /dev/null +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ + +core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + m_params.osHandle, + std::make_unique(core::smart_refctd_ptr(this))).memory; +} + +#if 0 +core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) const +{ + if (!device || 
!m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto buf = device->createBuffer({{ + .size = m_params.granularSize, + .usage = usage }, {{ + .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), + .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle + }}}); + + auto req = buf->getMemoryReqs(); + auto pd = device->getPhysicalDevice(); + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; + case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + if (!device->allocate(req, buf.get()).isValid()) + return nullptr; + + return buf; +} + +#endif + +core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + // auto img = device->createImage({ + // std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, + // IGPUImage::ET_LINEAR, + // IGPUImage::EL_PREINITIALIZED, + // }); + // + // if (exportAsMemory(device, img.get())) + // return img; + + return nullptr; +} + +CCUDASharedMemory::~CCUDASharedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + CUresult re[] = { + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), + }; + CloseHandle(m_params.osHandle); + +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file From 2e457029b6318732a22f958b2422317269d296e1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Mar 2026 18:43:57 +0700 Subject: [PATCH 026/149] Add NBL_API2 modifier to CCUDADevice --- include/nbl/video/CCUDADevice.h | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 62c3360d1e..7668bb2ea5 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -25,7 +25,7 @@ namespace nbl::video { class CCUDAHandler; -class CCUDADevice : public core::IReferenceCounted +class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: #ifdef _WIN32 From 741252f1964ba2f807f260762fbfe37a63b259f9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Mar 2026 18:46:09 +0700 Subject: [PATCH 027/149] Implementation of Shared semaphore between Vulkan and CUDA --- include/nbl/video/CCUDADevice.h | 6 +++ include/nbl/video/CCUDASharedSemaphore.h | 52 ++++++++++++++++++++++++ include/nbl/video/ILogicalDevice.h | 2 +- include/nbl/video/ISemaphore.h | 34 +++++++++++++++- src/nbl/CMakeLists.txt | 1 + src/nbl/video/CCUDADevice.cpp | 26 ++++++++++++ src/nbl/video/CCUDASharedSemaphore.cpp | 19 +++++++++ src/nbl/video/CVulkanLogicalDevice.cpp | 40 +++++++++++++++--- src/nbl/video/CVulkanLogicalDevice.h | 2 +- src/nbl/video/CVulkanSemaphore.cpp | 9 +++- src/nbl/video/CVulkanSemaphore.h | 4 +- 11 files changed, 183 insertions(+), 12 deletions(-) create mode 100644 include/nbl/video/CCUDASharedSemaphore.h create mode 100644 src/nbl/video/CCUDASharedSemaphore.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7668bb2ea5..e80bd18138 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -7,6 +7,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDASharedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -201,7 +202,12 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } size_t roundToGranularity(CUmemLocationType location, size_t size) const; 
CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + protected: + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); ~CCUDADevice(); diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h new file mode 100644 index 0000000000..6c69f75438 --- /dev/null +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -0,0 +1,52 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUexternalSemaphore getInternalObject() const { return m_handle; } + +protected: + + CCUDASharedSemaphore(core::smart_refctd_ptr device, + core::smart_refctd_ptr src, + CUexternalSemaphore semaphore, + ExternalHandleType osHandle) + : m_device(std::move(device)) + , m_src(std::move(m_src)) + , m_handle(semaphore) + , m_osHandle(osHandle) + {} + ~CCUDASharedSemaphore() override; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; + ExternalHandleType m_osHandle; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 1c7393bb57..9f2c589172 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, SCreationParams&& creationParams) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index d4fbdd1756..67a093f9d3 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -15,6 +15,34 @@ namespace nbl::video class ISemaphore : public IBackendObject { public: + + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCreationParams + { + // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources + std::unique_ptr preDestroyCleanup = nullptr; + // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources + std::unique_ptr postDestroyCleanup = nullptr; + // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects + bool skipHandleDestroy = false; + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE + ExternalHandleType externalHandle = nullptr; + }; + // basically a pool function virtual uint64_t getCounterValue() const = 0; @@ -146,9 +174,13 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + const SCreationParams& getCreationParams() const { return m_creationParams; } + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} virtual ~ISemaphore() = default; + + SCreationParams m_creationParams; }; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 09d3587e1d..bbec1b1691 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,6 +294,7 @@ set(NBL_VIDEO_SOURCES # CUDA 
video/CCUDAHandler.cpp video/CCUDADevice.cpp + video/CCUDASharedSemaphore.cpp video/CCUDASharedMemory.cpp ) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 9dc3908b6b..b7313b80bf 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -243,6 +243,32 @@ CUresult CCUDADevice::createSharedMemory( return CUDA_SUCCESS; } + +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +{ + if (!sema || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleTypes.value; + auto handle = sema->getCreationParams().externalHandle; + + if (!handleType || !handle) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { + .type = static_cast(handleType), + .handle = {.win32 = {.handle = handle }}, + }; + + CUexternalSemaphore cusema; + if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + return err; + + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); + return CUDA_SUCCESS; +} + CCUDADevice::~CCUDADevice() { m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp new file mode 100644 index 0000000000..049f93ac13 --- /dev/null +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDASharedSemaphore.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDASharedSemaphore::~CCUDASharedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + cu.pcuDestroyExternalSemaphore(m_handle); + CloseHandle(m_osHandle); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 6d5f896765..cd49be13cd 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -56,10 +56,24 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams) { + + // TODO(kevin) : Handle importing external semaphore into Vulkan + // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; + + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .dwAccess = /*DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE*/0x80000000L | 1 + }; + VkExportSemaphoreCreateInfo exportInfo = { + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, + &handleInfo, + static_cast(creationParams.externalHandleTypes.value) + }; + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = creationParams.externalHandleTypes.value ? 
&exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; @@ -67,11 +81,27 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; + + if (creationParams.externalHandleTypes.value) + { + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = static_cast(creationParams.externalHandleTypes.value), + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &creationParams.externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; + } + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore); + } + ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { using retval_t = ISemaphore::WAIT_RESULT; diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 8f43a6783a..09213f28db 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -53,7 +53,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice CVulkanLogicalDevice(core::smart_refctd_ptr&& api, renderdoc_api_t* const rdoc, const IPhysicalDevice* const physicalDevice, const VkDevice vkdev, const SCreationParams& 
params); // sync stuff - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 071c4b2843..792d1f27f1 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -7,8 +7,13 @@ namespace nbl::video CVulkanSemaphore::~CVulkanSemaphore() { - const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + m_creationParams.preDestroyCleanup = nullptr; + if (!m_creationParams.skipHandleDestroy) + { + const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + } } uint64_t CVulkanSemaphore::getCounterValue() const diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..cc5d15d3f4 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,8 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore) + : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore) {} ~CVulkanSemaphore(); uint64_t getCounterValue() const override; From 
fe75ce017e64279203c9aaa7a5c1e79a5264a691 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:31:26 +0700 Subject: [PATCH 028/149] Update to CUDA Toolkit version 13.0+ --- include/nbl/video/CCUDAHandler.h | 4 ++-- src/nbl/video/CCUDADevice.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index ef040f5536..9de55914b5 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -39,7 +39,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v2 + ,cuCtxCreate_v4 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +62,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid + ,cuDeviceGetUuid_v2 ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index b7313b80bf..3b8ea3bee8 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -28,7 +28,7 @@ CCUDADevice::CCUDADevice( auto& cu = m_handler->getCUDAFunctionTable(); - CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + CUresult re = cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle); assert(CUDA_SUCCESS == re); re = cu.pcuCtxSetCurrent(m_context); assert(CUDA_SUCCESS == re); diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 0eba770c89..c1044dd894 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -42,7 +42,7 @@ CCUDAHandler::CCUDAHandler( continue; CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; 
m_availableDevices.emplace_back(handle, uuid); From 78fc0df8c1c15edb88c2f3b48467b296681a0ddc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:38:15 +0700 Subject: [PATCH 029/149] Fix external semaphore --- include/nbl/video/CCUDADevice.h | 5 ++++- include/nbl/video/ILogicalDevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 8 +++++++- src/nbl/video/CVulkanLogicalDevice.cpp | 6 +----- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index e80bd18138..3d40ebff25 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -194,7 +194,10 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); + static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, + + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); +uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif CUdevice getInternalObject() const { return m_handle; } diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 9f2c589172..1dce5e7091 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! 
Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 3b8ea3bee8..ac25bb234a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -256,8 +256,14 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(handleType), + .type = EXTERNAL_SEMAPHORE_HANDLE_TYPE, .handle = {.win32 = {.handle = handle }}, }; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index cd49be13cd..dab9862964 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -62,13 +62,9 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u // TODO(kevin) : Handle importing external semaphore into Vulkan // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; - VkExportSemaphoreWin32HandleInfoKHR handleInfo = { - .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, - .dwAccess = /*DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE*/0x80000000L | 1 - }; VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, - &handleInfo, + nullptr, static_cast(creationParams.externalHandleTypes.value) }; From 5d19c5bd2be94d2bead2e6f0c6f35108f6495150 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:39:54 +0700 Subject: [PATCH 030/149] External image implementation --- 
include/nbl/video/ILogicalDevice.h | 16 +-------- include/nbl/video/IPhysicalDevice.h | 16 +++++++++ src/nbl/video/CVulkanLogicalDevice.cpp | 10 +++++- src/nbl/video/CVulkanPhysicalDevice.cpp | 39 ++++++++++++++++++++++ src/nbl/video/CVulkanPhysicalDevice.h | 4 ++- src/nbl/video/ILogicalDevice.cpp | 44 +++++++++++++++++++++++++ 6 files changed, 112 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 1dce5e7091..d6d2f8530a 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -335,21 +335,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - // TODO: validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage docs - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams); // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 2ae58f22e3..f8550debce 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -676,6 +676,21 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return getExternalMemoryProperties_impl(usages, handleType); } + struct SImageFormatInfo + { + asset::E_FORMAT format; + IGPUImage::E_TYPE type; + IGPUImage::TILING tiling; + core::bitflag usage; + core::bitflag flags; + }; + SExternalMemoryProperties getExternalImageProperties( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + return getExternalMemoryProperties_impl(info, handleType); + } + struct SBufferFormatPromotionRequest { asset::E_FORMAT originalFormat = asset::EF_UNKNOWN; SFormatBufferUsages::SUsage usages = SFormatBufferUsages::SUsage(); @@ -722,6 +737,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable // External memory properties query virtual SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; // ILogicalDevice creation bool validateLogicalDeviceCreation(const ILogicalDevice::SCreationParams& params) const; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index dab9862964..c0df8fd9f4 100644 --- 
a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -488,7 +488,14 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); + const bool external = params.externalHandleTypes.value; + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; + vk_createInfo.pNext = external ? &externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(params.flags.value); vk_createInfo.imageType = static_cast(params.type); vk_createInfo.format = getVkFormatFromFormat(params.format); @@ -506,7 +513,8 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + // The Vulkan spec states: If the pNext chain includes a VkExternalMemoryImageCreateInfo or VkExternalMemoryImageCreateInfoNV structure whose handleTypes member is not 0, initialLayout must be VK_IMAGE_LAYOUT_UNDEFINED + vk_createInfo.initialLayout = external ? VK_IMAGE_LAYOUT_UNDEFINED : (params.preinitialized ? 
VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED); VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 54e8543668..64dcc24fc4 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1,5 +1,6 @@ #include "nbl/video/CVulkanPhysicalDevice.h" #include "nbl/video/CVulkanLogicalDevice.h" +#include "nbl/video/IGPUImage.h" namespace nbl::video { @@ -1390,6 +1391,44 @@ IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMem }; } +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + VkPhysicalDeviceExternalImageFormatInfo externalImageFormatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 formatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &externalImageFormatInfo, + .format = getVkFormatFromFormat(info.format), + .type = static_cast(info.type), + .tiling = static_cast(info.tiling), + .usage = getVkImageUsageFlagsFromImageUsageFlags(info.usage.value, asset::isDepthOrStencilFormat(info.format)), + .flags = static_cast(info.flags.value), + }; + + VkExternalImageFormatProperties externalProps = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES, + }; + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + auto re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &formatInfo, &props); + assert(VK_SUCCESS == re); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = 
static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) { // We might alter it to account for dependancies. diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index 5cb2556d6e..40e0dd78fe 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,7 +109,9 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } - SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const; + SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + + SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 01e49a26d2..da883d3974 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -356,6 +356,50 @@ core::smart_refctd_ptr ILogicalDevice::createBufferView(const as return createBufferView_impl(underlying, _fmt); } +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& creationParams) +{ + if (!IGPUImage::validateCreationParameters(creationParams)) + { + m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); + return nullptr; + } + if 
(creationParams.queueFamilyIndexCount>MaxQueueFamilies) + { + m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(IPhysicalDevice::SImageFormatInfo{ + .format = creationParams.format, + .type = creationParams.type, + .tiling = creationParams.tiling, + .usage = creationParams.usage, + .flags = creationParams.flags + }, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + + // TODO: validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage docs + return createImage_impl(std::move(creationParams), dedicatedOnly); +} core::smart_refctd_ptr ILogicalDevice::compileShader(const SShaderCreationParameters& creationParams) { From f23b30c87eddbcd021f1f65c772eeb2db5684865 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:40:13 +0700 Subject: [PATCH 031/149] Remove unnecessary inline modifier --- include/nbl/video/ILogicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index d6d2f8530a..a3a9b264d0 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -331,7 +331,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) From e50c85e5f774ecf6289bc3fcc26f357f30327c4a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:21:11 +0700 Subject: [PATCH 032/149] Remove unused code in CCUDADevice --- include/nbl/video/CCUDADevice.h | 115 -------------------------------- src/nbl/video/CCUDADevice.cpp | 110 ------------------------------ 2 files changed, 225 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 3d40ebff25..6b3ab2bbb6 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -85,121 +85,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } - // 
TODO/REDO Vulkan: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability - // Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions) - // Also maybe separate this out into its own `CCUDA` class instead of nesting it here? -#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - } asImage; - }; - }; - - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - - - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, 
GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; - } - - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, - - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); -uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif - CUdevice getInternalObject() const { return m_handle; } const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } diff --git a/src/nbl/video/CCUDADevice.cpp 
b/src/nbl/video/CCUDADevice.cpp index ac25bb234a..5d1198bb0d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -48,116 +48,6 @@ CCUDADevice::CCUDADevice( } } - -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if (retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; - - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) - { - case GL_TEXTURE_2D: - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; - } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if (retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} - - -constexpr auto MaxAquireOps = 4096u; - -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? 
sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -#endif - size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const { return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; From a9c2d85e192972d524e8382e17d13b229ffada58 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:22:04 +0700 Subject: [PATCH 033/149] Fix importSemaphore for unix --- src/nbl/video/CCUDADevice.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 5d1198bb0d..22421522f3 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -146,17 +146,17 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr Date: Wed, 25 Mar 2026 14:22:52 +0700 Subject: [PATCH 034/149] Remove searching for old nvrtc version --- src/nbl/video/CCUDAHandler.cpp | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 
c1044dd894..770db41946 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -455,35 +455,9 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste // Perpetual TODO: any new CUDA releases we need to account for? // Version List: https://developer.nvidia.com/cuda-toolkit-archive const char* nvrtc64_versions[] = { + "nvrtc64_132", "nvrtc64_131", "nvrtc64_130", - "nvrtc64_129", - "nvrtc64_128", - "nvrtc64_126", - "nvrtc64_125", - "nvrtc64_124", - "nvrtc64_123", - "nvrtc64_122", - "nvrtc64_121", - "nvrtc64_120", - "nvrtc64_118", - "nvrtc64_117", - "nvrtc64_116", - "nvrtc64_115", - "nvrtc64_114", - "nvrtc64_113", - "nvrtc64_112", - "nvrtc64_111", - "nvrtc64_110", - "nvrtc64_102", - "nvrtc64_101", - "nvrtc64_100", - "nvrtc64_92", - "nvrtc64_91", - "nvrtc64_90", - "nvrtc64_80", - "nvrtc64_75", - "nvrtc64_70", nullptr }; @@ -523,7 +497,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<9000) + if (cudaVersion<13000) return nullptr; // stop the pollution From c244b77a7feffdec21441c4f75cdc3228f32a3b8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:23:50 +0700 Subject: [PATCH 035/149] Fix filling dstQueueFamilyIndex --- src/nbl/video/CVulkanCommandBuffer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index a04b5940ce..40b20bb5d2 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -90,10 +90,10 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily switch (in.ownershipOp) { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; case 
IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; } } From d24acf9fd6181a60d1f22d92da3dc434e1c4aceb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:26:12 +0700 Subject: [PATCH 036/149] Update cuda toolkit requirement in cmake --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0068b002a..5be1855959 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,10 +75,10 @@ option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" ON) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER "9.0") - message(STATUS "CUDA version 9.0+ found!") + if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") else() - message(FATAL_ERROR "CUDA version 9.0+ needed for C++14 support!") + message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!") endif() endif() From ff828003da1f7327af4c399fc3dd0d0bd6d22013 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 15 Apr 2026 17:29:38 +0700 Subject: [PATCH 037/149] Improve external semaphore handle management - Separate cached creation params from creation params in ISemaphore - Move external handle storage to backend-specific classes (e.g., CVulkanSemaphore) - Add virtual getExternalHandle() to ISemaphore interface - Update Vulkan semaphore creation to retrieve and store external handle after creation - Ensure proper cleanup of external handles in CVulkanSemaphore destructor --- include/nbl/video/CCUDASharedSemaphore.h | 2 -- include/nbl/video/ISemaphore.h | 24 +++++++++----------- src/nbl/video/CCUDADevice.cpp | 11 +++++---- src/nbl/video/CCUDASharedSemaphore.cpp | 4 ++-- src/nbl/video/CVulkanLogicalDevice.cpp | 29 ++++++++++++++++++------ src/nbl/video/CVulkanSemaphore.cpp | 16 
+++++++++---- src/nbl/video/CVulkanSemaphore.h | 6 +++-- 7 files changed, 56 insertions(+), 36 deletions(-) diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h index 6c69f75438..8a3a73d0b4 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -35,14 +35,12 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) - , m_osHandle(osHandle) {} ~CCUDASharedSemaphore() override; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; CUexternalSemaphore m_handle; - ExternalHandleType m_osHandle; }; } diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 67a093f9d3..59886b32cb 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -28,21 +28,14 @@ class ISemaphore : public IBackendObject }; //! - struct SCreationParams + struct SCachedCreationParams { - // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources - std::unique_ptr preDestroyCleanup = nullptr; - // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources - std::unique_ptr postDestroyCleanup = nullptr; - // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects - bool skipHandleDestroy = false; // Handle Type for external resources core::bitflag externalHandleTypes = EHT_NONE; - //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE - //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE - ExternalHandleType externalHandle = nullptr; }; + struct SCreationParams : SCachedCreationParams {}; + // basically a pool function virtual uint64_t getCounterValue() const = 0; @@ -174,13 +167,18 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; - const SCreationParams& getCreationParams() const { return m_creationParams; } + virtual ExternalHandleType getExternalHandle() const = 0; + + const SCachedCreationParams& getCreationParams() const { return m_creationParams; } + + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} + inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : + IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} virtual ~ISemaphore() = default; - SCreationParams m_creationParams; + SCachedCreationParams m_creationParams; }; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 22421522f3..a5dcb52d8a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -141,18 +141,18 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptrgetCUDAFunctionTable(); auto handleType = sema->getCreationParams().externalHandleTypes.value; - auto handle = sema->getCreationParams().externalHandle; - if (!handleType || !handle) + if (!handleType) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, - .handle = {.win32 = {.handle = handle }}, + // TODO(kevinyu): Fix this later. Make it compile first. 
+ .handle = {.win32 = {.handle = sema->getExternalHandle() }}, #else .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - .handle = {.fd = handle} + .handle = {.fd = sema->getExternalHandle()} #endif }; @@ -161,7 +161,8 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); + // TODO(kevinyu): Fix the handle parameter later. Make it compile first. + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, {}), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp index 049f93ac13..ae2291035a 100644 --- a/src/nbl/video/CCUDASharedSemaphore.cpp +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -11,8 +11,8 @@ namespace nbl::video CCUDASharedSemaphore::~CCUDASharedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cu.pcuDestroyExternalSemaphore(m_handle); - CloseHandle(m_osHandle); + if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) + assert(!"Invalid code path."); } } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index c0df8fd9f4..66a0198402 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -62,7 +62,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u // TODO(kevin) : Handle importing external semaphore into Vulkan // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; - VkExportSemaphoreCreateInfo exportInfo = { + VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, nullptr, static_cast(creationParams.externalHandleTypes.value) @@ -80,22 +80,37 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u if 
(!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; - if (creationParams.externalHandleTypes.value) + ExternalHandleType externalHandle = ExternalHandleType{}; + const auto handleType = static_cast(creationParams.externalHandleTypes.value); + if (handleType != 0) { +#ifdef _WIN32 VkSemaphoreGetWin32HandleInfoKHR props = { .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, .semaphore = semaphore, - .handleType = static_cast(creationParams.externalHandleTypes.value), + .handleType = handleType, }; - if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &creationParams.externalHandle)) + + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &externalHandle)) { - m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); return nullptr; } +#else + VkSemaphoreGetFdInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = vkSemaphore, + .handleType = handleType, + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreFdKHR(m_vkdev, &props, &externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); + return nullptr; + } +#endif } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore); - + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore, externalHandle); } ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 792d1f27f1..958849dae2 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -7,12 +7,18 @@ namespace nbl::video CVulkanSemaphore::~CVulkanSemaphore() { - m_creationParams.preDestroyCleanup = nullptr; - if (!m_creationParams.skipHandleDestroy) + const 
CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + if (m_creationParams.externalHandleTypes != EHT_NONE) { - const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - auto* vk = vulkanDevice->getFunctionTable(); - vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); +#ifdef _WIN32 + if (!CloseHandle(m_externalHandle)) + assert(!"Invalid code path."); +#else + if (close(m_externalHandle) != 0) + assert(!"Invalid code path."); +#endif } } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index cc5d15d3f4..3fd4cb82dc 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,8 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const ExternalHandleType externalHandle) + : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); uint64_t getCounterValue() const override; @@ -24,11 +24,13 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} + ExternalHandleType getExternalHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; + const ExternalHandleType m_externalHandle; }; } From 7b486059444fc4146986a0ef5a702fe54202f384 Mon Sep 17 
00:00:00 2001 From: kevyuu Date: Wed, 15 Apr 2026 18:13:35 +0700 Subject: [PATCH 038/149] Improve win32HandleMetadata parameter so it is more readable --- src/nbl/video/CCUDADevice.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index a5dcb52d8a..9bb5e739f5 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,6 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDADevice.h" +#include + #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -95,13 +97,14 @@ CUresult CCUDADevice::createSharedMemory( auto& cu = m_handler->getCUDAFunctionTable(); - uint32_t metaData[16] = { 48 }; + OBJECT_ATTRIBUTES metadata = {}; + metadata.Length = sizeof(OBJECT_ATTRIBUTES); CUmemAllocationProp prop = { .type = CU_MEM_ALLOCATION_TYPE_PINNED, .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, .location = { .type = params.location, .id = m_handle }, - .win32HandleMetaData = metaData, + .win32HandleMetaData = &metadata, }; params.granularSize = roundToGranularity(params.location, params.size); From 24ba36e6736cce8ebc6a87a9eef140b625b31bc7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 16 Apr 2026 10:46:17 +0700 Subject: [PATCH 039/149] Refactor CCUDASharedMemory to use ExternalHandleType --- include/nbl/video/CCUDASharedMemory.h | 6 +----- src/nbl/video/CCUDADevice.cpp | 6 +++--- src/nbl/video/CCUDASharedMemory.cpp | 4 ++-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index f133dadd81..15de1b72c4 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -42,11 +42,7 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted { size_t granularSize; CUdeviceptr ptr; - union - { - void* osHandle; - int fd; - }; + ExternalHandleType externalHandle; }; const SCreationParams& 
getCreationParams() const { return m_params; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 9bb5e739f5..738913e709 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -113,7 +113,7 @@ CUresult CCUDADevice::createSharedMemory( if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemExportToShareableHandle(¶ms.osHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { cu.pcuMemRelease(mem); return err; @@ -121,14 +121,14 @@ CUresult CCUDADevice::createSharedMemory( if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseHandle(params.osHandle); + CloseHandle(params.externalHandle); cu.pcuMemRelease(mem); return err; } if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CloseHandle(params.osHandle); + CloseHandle(params.externalHandle); return err; } diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 93ab6f4c48..2e58fa4756 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -35,7 +35,7 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor dedication, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - m_params.osHandle, + m_params.externalHandle, std::make_unique(core::smart_refctd_ptr(this))).memory; } @@ -97,7 +97,7 @@ CCUDASharedMemory::~CCUDASharedMemory() CUresult re[] = { cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), }; - CloseHandle(m_params.osHandle); + CloseHandle(m_params.externalHandle); } } From 5b4fc27391b035a0f0ff3d5a22ad1c3d3768ed02 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 17 Apr 2026 14:33:23 +0700 Subject: [PATCH 040/149] Refactor 
ExternalHandleType --- include/nbl/video/CCUDASharedMemory.h | 2 +- include/nbl/video/CCUDASharedSemaphore.h | 2 +- include/nbl/video/EApiType.h | 32 ++++++++++++++++++++- include/nbl/video/IDeviceMemoryAllocation.h | 2 +- include/nbl/video/IDeviceMemoryAllocator.h | 8 +++--- include/nbl/video/ISemaphore.h | 2 +- src/nbl/video/CCUDADevice.cpp | 4 +-- src/nbl/video/CCUDASharedMemory.cpp | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 25 +++------------- src/nbl/video/CVulkanMemoryAllocation.cpp | 2 +- src/nbl/video/CVulkanSemaphore.cpp | 8 +----- src/nbl/video/CVulkanSemaphore.h | 6 ++-- 12 files changed, 51 insertions(+), 44 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 15de1b72c4..20902ac90b 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -42,7 +42,7 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted { size_t granularSize; CUdeviceptr ptr; - ExternalHandleType externalHandle; + external_handle_t externalHandle; }; const SCreationParams& getCreationParams() const { return m_params; } diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h index 8a3a73d0b4..60daec7159 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -31,7 +31,7 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore, - ExternalHandleType osHandle) + external_handle_t osHandle) : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 3e86c8d040..0726049200 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,7 +13,7 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; -using ExternalHandleType = +using external_handle_t = #ifdef 
_WIN32 void* #else @@ -21,6 +21,36 @@ int #endif ; +#ifdef _WIN32 +constexpr external_handle_t ExternalHandleNull = nullptr; +#else +constexpr external_handle_t ExternalHandleNull = -1; +#endif + +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return (close(handle) == 0); +#endif +} + +inline external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE re = ExternalHandleNull; + + const HANDLE cur = GetCurrentProcess(); + if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return re; +#else + return dup(handle); +#endif +} + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 8de6bd4fa8..cd15039203 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -176,7 +176,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - ExternalHandleType externalHandle = 0; + external_handle_t externalHandle = 0; }; struct SCreationParams: SInfo diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 9201d3f849..8fc07dd698 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -46,7 +46,7 @@ class NBL_API2 IDeviceMemoryAllocator IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, - ExternalHandleType handle) : + external_handle_t handle) : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs), m_handleType(handleType), @@ -83,7 +83,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; - ExternalHandleType m_handle; + external_handle_t m_handle; }; //! 
DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB @@ -94,7 +94,7 @@ class NBL_API2 IDeviceMemoryAllocator const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, - ExternalHandleType handle + external_handle_t handle ) : IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { @@ -125,7 +125,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked* dedication = nullptr, const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, - ExternalHandleType externalHandle = {}, + external_handle_t externalHandle = {}, std::unique_ptr&& postDestroyCleanup = nullptr) { for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 59886b32cb..0edc906b5d 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -167,7 +167,7 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; - virtual ExternalHandleType getExternalHandle() const = 0; + virtual external_handle_t getExternalHandle() const = 0; const SCachedCreationParams& getCreationParams() const { return m_creationParams; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 738913e709..9e572fe119 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -121,14 +121,14 @@ CUresult CCUDADevice::createSharedMemory( if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseHandle(params.externalHandle); + CloseExternalHandle(params.externalHandle); 
cu.pcuMemRelease(mem); return err; } if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CloseHandle(params.externalHandle); + CloseExternalHandle(params.externalHandle); return err; } diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 2e58fa4756..22a5ea858a 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -97,7 +97,7 @@ CCUDASharedMemory::~CCUDASharedMemory() CUresult re[] = { cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), }; - CloseHandle(m_params.externalHandle); + CloseExternalHandle(m_params.externalHandle); } } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 66a0198402..5b1e2ec981 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -80,7 +80,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; - ExternalHandleType externalHandle = ExternalHandleType{}; + external_handle_t externalHandle = external_handle_t{}; const auto handleType = static_cast(creationParams.externalHandleTypes.value); if (handleType != 0) { @@ -177,23 +177,6 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } -ExternalHandleType DupeHandle(uint64_t pid, ExternalHandleType handle) -{ -#ifdef _WIN32 - HANDLE re = 0; - - HANDLE cur = GetCurrentProcess(); - HANDLE src = pid ? 
OpenProcess(GENERIC_ALL, false, pid) : cur; - - if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) - return 0; - - CloseHandle(src); - return re; -#endif - return handle; -} - IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) @@ -240,8 +223,8 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca { if (info.externalHandle) //importing { - auto duped = DupeHandle(0, info.externalHandle); - const_cast(info.externalHandle) = duped; + auto duped = DuplicateExternalHandle(info.externalHandle); + const_cast(info.externalHandle) = duped; *pNext = &importInfo; } else // exporting @@ -312,7 +295,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca #else vkGetMemoryFdKHR #endif - (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) { m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); return {}; diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index c817213700..f2194756f9 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -11,7 +11,7 @@ CVulkanMemoryAllocation::CVulkanMemoryAllocation( CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { - if (m_params.externalHandle) + if (m_params.externalHandle != ExternalHandleNull) { bool re = CloseHandle(getCreationParams().externalHandle); assert(re); diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 958849dae2..35aefa6ebd 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -12,13 +12,7 @@ CVulkanSemaphore::~CVulkanSemaphore() vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); if (m_creationParams.externalHandleTypes != EHT_NONE) { 
-#ifdef _WIN32 - if (!CloseHandle(m_externalHandle)) - assert(!"Invalid code path."); -#else - if (close(m_externalHandle) != 0) - assert(!"Invalid code path."); -#endif + CloseExternalHandle(m_externalHandle); } } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 3fd4cb82dc..12ba147a24 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,7 +15,7 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const ExternalHandleType externalHandle) + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const external_handle_t externalHandle) : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); @@ -24,13 +24,13 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} - ExternalHandleType getExternalHandle() const override { return m_externalHandle; } + external_handle_t getExternalHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; - const ExternalHandleType m_externalHandle; + const external_handle_t m_externalHandle; }; } From fb66f3a83d8a8190258f1559f7baf9345335f3a7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 17 Apr 2026 14:35:22 +0700 Subject: [PATCH 041/149] Small fix to use CloseExternalHandle --- src/nbl/video/CVulkanMemoryAllocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index f2194756f9..f2d64eceed 100644 --- 
a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -13,7 +13,7 @@ CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { if (m_params.externalHandle != ExternalHandleNull) { - bool re = CloseHandle(getCreationParams().externalHandle); + bool re = CloseExternalHandle(getCreationParams().externalHandle); assert(re); } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); From 47ba7e4b6b36448a49bc2d5f5962ce423e8a5026 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:20:15 +0700 Subject: [PATCH 042/149] Remove CCUDASharedMemory::exportAsImage --- include/nbl/video/CCUDASharedMemory.h | 2 -- src/nbl/video/CCUDASharedMemory.cpp | 16 ---------------- 2 files changed, 18 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 20902ac90b..2ce4a8067e 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -49,8 +49,6 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; - protected: CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 22a5ea858a..a5f79a0c72 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -73,22 +73,6 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDev #endif -core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const -{ - if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) - return nullptr; - - // auto img = device->createImage({ - // 
std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, - // IGPUImage::ET_LINEAR, - // IGPUImage::EL_PREINITIALIZED, - // }); - // - // if (exportAsMemory(device, img.get())) - // return img; - - return nullptr; -} CCUDASharedMemory::~CCUDASharedMemory() { From d15d00c58564633352ab6d484e24fef50579c809 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:20:41 +0700 Subject: [PATCH 043/149] Remove unused CCUDASharedMemory::exportAsBuffer --- src/nbl/video/CCUDASharedMemory.cpp | 35 ----------------------------- 1 file changed, 35 deletions(-) diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index a5f79a0c72..34560e5575 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -39,41 +39,6 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor std::make_unique(core::smart_refctd_ptr(this))).memory; } -#if 0 -core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) const -{ - if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) - return nullptr; - - auto buf = device->createBuffer({{ - .size = m_params.granularSize, - .usage = usage }, {{ - .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), - .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle - }}}); - - auto req = buf->getMemoryReqs(); - auto pd = device->getPhysicalDevice(); - switch (m_params.location) - { - case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; - case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; - // TODO(Atil): Figure out how to handle these - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - default: break; - } - - if (!device->allocate(req, buf.get()).isValid()) - return nullptr; - - return buf; -} - -#endif - - 
CCUDASharedMemory::~CCUDASharedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); From ea361894c249ef8b41cf153c8e5eb6de223bba73 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:37:52 +0700 Subject: [PATCH 044/149] Refactor external memory allocation to store the external handle separated from imported handle --- include/nbl/video/CCUDASharedMemory.h | 1 + include/nbl/video/IDeviceMemoryAllocation.h | 2 ++ src/nbl/video/CVulkanLogicalDevice.cpp | 10 +++++----- src/nbl/video/CVulkanMemoryAllocation.cpp | 7 ++++--- src/nbl/video/CVulkanMemoryAllocation.h | 7 +++++++ 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 2ce4a8067e..35965e5370 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -58,6 +58,7 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted ~CCUDASharedMemory() override; core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_allocation; SCachedCreationParams m_params; }; diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index cd15039203..52b541ceb5 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -187,6 +187,8 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted inline const SCreationParams& getCreationParams() const { return m_params; } + virtual external_handle_t getExternalHandle() const = 0; + protected: inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) { diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 5b1e2ec981..c22cfe93b9 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -194,7 +194,6 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkImportMemoryWin32HandleInfoKHR 
importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, .handleType = static_cast(info.externalHandleType), - .handle = info.externalHandle }; VkExportMemoryWin32HandleInfoKHR handleInfo = { @@ -219,12 +218,13 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca const void** pNext = &vk_allocateFlagsInfo.pNext; + external_handle_t externalHandle = ExternalHandleNull; if (info.externalHandleType) { if (info.externalHandle) //importing { - auto duped = DuplicateExternalHandle(info.externalHandle); - const_cast(info.externalHandle) = duped; + externalHandle = DuplicateExternalHandle(info.externalHandle); + importInfo.handle = externalHandle; *pNext = &importInfo; } else // exporting @@ -295,7 +295,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca #else vkGetMemoryFdKHR #endif - (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + (m_vkdev, &handleInfo, &externalHandle)) { m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); return {}; @@ -307,7 +307,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; IDeviceMemoryAllocator::SAllocation ret = {}; - ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, externalHandle, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index f2d64eceed..0ec6fc351d 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -6,14 +6,15 
@@ namespace nbl::video CVulkanMemoryAllocation::CVulkanMemoryAllocation( const CVulkanLogicalDevice* dev, const VkDeviceMemory deviceMemoryHandle, + const external_handle_t externalHandle, SCreationParams&& params -) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} +) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle), m_externalHandle(externalHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { - if (m_params.externalHandle != ExternalHandleNull) + if (m_externalHandle != ExternalHandleNull) { - bool re = CloseExternalHandle(getCreationParams().externalHandle); + bool re = CloseExternalHandle(m_externalHandle); assert(re); } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 22e32142c0..473d826595 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -17,11 +17,17 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation CVulkanMemoryAllocation( const CVulkanLogicalDevice* dev, const VkDeviceMemory deviceMemoryHandle, + const external_handle_t externalHandle, SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } + inline external_handle_t getExternalHandle() const override + { + return m_externalHandle; + } + private: ~CVulkanMemoryAllocation(); @@ -30,6 +36,7 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation core::smart_refctd_ptr m_vulkanDevice; const VkDeviceMemory m_deviceMemoryHandle; + const external_handle_t m_externalHandle; }; } From f04dcdb03b1d74b1354340d3d499050ea30de2d1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:38:13 +0700 Subject: [PATCH 045/149] Remove unused constructor parameter in 
CCUDASharedSemaphore --- include/nbl/video/CCUDASharedSemaphore.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h index 60daec7159..2277ea57cf 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -30,8 +30,7 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalSemaphore semaphore, - external_handle_t osHandle) + CUexternalSemaphore semaphore) : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) From cea9d9e81f1ba9d5d2812bf68d258f27c20dccba Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:38:56 +0700 Subject: [PATCH 046/149] Implement CCUDAImportedMemory --- include/nbl/video/CCUDADevice.h | 3 ++ include/nbl/video/CCUDAImportedMemory.h | 42 +++++++++++++++++++++++++ src/nbl/CMakeLists.txt | 1 + src/nbl/video/CCUDADevice.cpp | 34 ++++++++++++++++++-- src/nbl/video/CCUDAImportedMemory.cpp | 33 +++++++++++++++++++ 5 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 include/nbl/video/CCUDAImportedMemory.h create mode 100644 src/nbl/video/CCUDAImportedMemory.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 6b3ab2bbb6..a80bbbbd28 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -7,6 +7,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDASharedSemaphore.h" @@ -91,6 +92,8 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(CUmemLocationType location, size_t size) const; CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, 
IDeviceMemoryAllocation* mem); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); protected: diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h new file mode 100644 index 0000000000..8fbbccb31b --- /dev/null +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -0,0 +1,42 @@ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." +#endif + +#endif // _NBL_COMPILE_WITH_CUDA + +namespace nbl::video +{ + +class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +{ + public: + friend class CCUDADevice; + + CUexternalMemory getInternalObject() const { return m_handle; } + CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); + + protected: + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, + CUexternalMemory cuExtMem) : + m_device(device), + m_src(src), + m_handle(cuExtMem) {} + + ~CCUDAImportedMemory() override; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalMemory m_handle; + +}; + +} + +#endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index bbec1b1691..eedfd514c6 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -296,6 +296,7 @@ set(NBL_VIDEO_SOURCES video/CCUDADevice.cpp video/CCUDASharedSemaphore.cpp video/CCUDASharedMemory.cpp + video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 9e572fe119..535fb76d46 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -5,6 +5,8 @@ #include +#include "nbl/video/CCUDAImportedMemory.h" + #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -137,6 +139,35 @@ CUresult CCUDADevice::createSharedMemory( return CUDA_SUCCESS; } +CUresult 
CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) +{ + if (!mem || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = mem->getCreationParams().externalHandleType; + + if (!handleType) return CUDA_ERROR_INVALID_VALUE; + + const auto externalHandle = mem->getExternalHandle(); + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; +#ifdef _WIN32 + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; + extMemDesc.handle.win32.handle = externalHandle; +#else + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; + extMemDesc.handle.fd = externalHandle; +#endif + extMemDesc.size = mem->getAllocationSize(); + + CUexternalMemory cuExtMem; + if (auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + return err; + *outPtr = core::smart_refctd_ptr(new CCUDAImportedMemory(core::smart_refctd_ptr(this), core::smart_refctd_ptr(mem), cuExtMem), core::dont_grab); + return CUDA_SUCCESS; +} + CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) @@ -164,8 +195,7 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, {}), core::dont_grab); + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp new file mode 100644 index 0000000000..33ba43eb28 --- /dev/null +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +namespace nbl::video +{ + +CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +{ + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); + +} + +CCUDAImportedMemory::~CCUDAImportedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + if (cu.pcuDestroyExternalMemory(m_handle) != CUDA_SUCCESS) + assert(!"Invalid code path"); +} + +} + +#endif \ No newline at end of file From 3ea3e9d5ae485b58069f3284ed646a28d7ab071c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 18:07:38 +0700 Subject: [PATCH 047/149] Rename CCUDASharedSemaphore into CCUDAImportedSemaphore --- include/nbl/video/CCUDADevice.h | 4 ++-- ...edSemaphore.h => CCUDAImportedSemaphore.h} | 10 +++++----- src/nbl/CMakeLists.txt | 2 +- src/nbl/video/CCUDADevice.cpp | 4 ++-- src/nbl/video/CCUDAImportedSemaphore.cpp | 19 +++++++++++++++++++ 5 files changed, 29 insertions(+), 10 deletions(-) rename include/nbl/video/{CCUDASharedSemaphore.h => CCUDAImportedSemaphore.h} (76%) create mode 100644 src/nbl/video/CCUDAImportedSemaphore.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index a80bbbbd28..c7778af0be 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -8,7 +8,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/CCUDASharedMemory.h" #include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDASharedSemaphore.h" +#include "nbl/video/CCUDAImportedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -94,7 +94,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted 
CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h similarity index 76% rename from include/nbl/video/CCUDASharedSemaphore.h rename to include/nbl/video/CCUDAImportedSemaphore.h index 2277ea57cf..d5139a55c9 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -1,8 +1,8 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ -#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -19,7 +19,7 @@ namespace nbl::video { -class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted +class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: friend class CCUDADevice; @@ -28,14 +28,14 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted protected: - CCUDASharedSemaphore(core::smart_refctd_ptr device, + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore) : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) {} - ~CCUDASharedSemaphore() override; + ~CCUDAImportedSemaphore() override; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index eedfd514c6..52605112e5 100644 
--- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,7 +294,7 @@ set(NBL_VIDEO_SOURCES # CUDA video/CCUDAHandler.cpp video/CCUDADevice.cpp - video/CCUDASharedSemaphore.cpp + video/CCUDAImportedSemaphore.cpp video/CCUDASharedMemory.cpp video/CCUDAImportedMemory.cpp ) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 535fb76d46..30d4093fb1 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -168,7 +168,7 @@ CUresult CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; @@ -195,7 +195,7 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); + *outPtr = core::smart_refctd_ptr(new CCUDAImportedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp new file mode 100644 index 0000000000..69b851088e --- /dev/null +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAImportedSemaphore.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) + assert(!"Invalid code path."); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file From 130cd1ef1124110f3a0a1f4b07e8d672c1a2e9e5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 18:11:13 +0700 Subject: [PATCH 048/149] Rename CCUDASharedMemory into CCUDAExportableMemory --- include/nbl/video/CCUDADevice.h | 4 ++-- .../{CCUDASharedMemory.h => CCUDAExportableMemory.h} | 10 +++++----- src/nbl/CMakeLists.txt | 2 +- src/nbl/video/CCUDADevice.cpp | 10 +++++----- ...CCUDASharedMemory.cpp => CCUDAExportableMemory.cpp} | 8 ++++---- 5 files changed, 17 insertions(+), 17 deletions(-) rename include/nbl/video/{CCUDASharedMemory.h => CCUDAExportableMemory.h} (82%) rename src/nbl/video/{CCUDASharedMemory.cpp => CCUDAExportableMemory.cpp} (83%) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index c7778af0be..869e84d691 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -6,7 +6,7 @@ #include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" @@ -90,7 +90,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } size_t roundToGranularity(CUmemLocationType location, size_t size) const; - CUresult 
createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDAExportableMemory.h similarity index 82% rename from include/nbl/video/CCUDASharedMemory.h rename to include/nbl/video/CCUDAExportableMemory.h index 35965e5370..8729c87338 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -1,8 +1,8 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ -#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -24,7 +24,7 @@ class CCUDAMemoryMapping: public core::IReferenceCounted { }; -class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted +class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: friend class CCUDADevice; @@ -51,11 +51,11 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted protected: - CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) : m_device(std::move(device)) , m_params(std::move(params)) {} - ~CCUDASharedMemory() override; + ~CCUDAExportableMemory() override; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_allocation; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 52605112e5..692efec8bd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -295,7 +295,7 @@ 
set(NBL_VIDEO_SOURCES video/CCUDAHandler.cpp video/CCUDADevice.cpp video/CCUDAImportedSemaphore.cpp - video/CCUDASharedMemory.cpp + video/CCUDAExportableMemory.cpp video/CCUDAImportedMemory.cpp ) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 30d4093fb1..bd54cce81e 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -88,14 +88,14 @@ CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -CUresult CCUDADevice::createSharedMemory( - core::smart_refctd_ptr* outMem, - CCUDASharedMemory::SCreationParams&& inParams) +CUresult CCUDADevice::createExportableMemory( + core::smart_refctd_ptr* outMem, + CCUDAExportableMemory::SCreationParams&& inParams) { if (!outMem) return CUDA_ERROR_INVALID_VALUE; - CCUDASharedMemory::SCachedCreationParams params = { inParams }; + CCUDAExportableMemory::SCachedCreationParams params = { inParams }; auto& cu = m_handler->getCUDAFunctionTable(); @@ -134,7 +134,7 @@ CUresult CCUDADevice::createSharedMemory( return err; } - *outMem = core::smart_refctd_ptr(new CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + *outMem = core::smart_refctd_ptr(new CCUDAExportableMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp similarity index 83% rename from src/nbl/video/CCUDASharedMemory.cpp rename to src/nbl/video/CCUDAExportableMemory.cpp index 34560e5575..bbe773f610 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -2,14 +2,14 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDADevice.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { auto pd = device->getPhysicalDevice(); uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; @@ -36,10 +36,10 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, m_params.externalHandle, - std::make_unique(core::smart_refctd_ptr(this))).memory; + std::make_unique(core::smart_refctd_ptr(this))).memory; } -CCUDASharedMemory::~CCUDASharedMemory() +CCUDAExportableMemory::~CCUDAExportableMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); From c624053e03598cfde6dac16660124506a17b6cb5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 18:54:18 +0700 Subject: [PATCH 049/149] Remove unused member in CCUDAExportableMemory --- include/nbl/video/CCUDAExportableMemory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 8729c87338..b4df99d9f5 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -58,7 +58,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted ~CCUDAExportableMemory() override; core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_allocation; SCachedCreationParams m_params; }; From 9127faa8215ad6fe9de8dabd7563d7d9263d7b6f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:08:29 +0700 Subject: [PATCH 050/149] Slight rename to 
CCUDADevice method --- include/nbl/video/CCUDADevice.h | 4 ++-- src/nbl/video/CCUDADevice.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 869e84d691..89449a21f0 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -92,9 +92,9 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(CUmemLocationType location, size_t size) const; CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); - CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); + CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index bd54cce81e..3f933be988 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -139,11 +139,13 @@ CUresult CCUDADevice::createExportableMemory( return CUDA_SUCCESS; } -CUresult CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) +CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) { if (!mem || !outPtr) return CUDA_ERROR_INVALID_VALUE; + const auto memProperty = mem->getCreationParams().memoryPropertyFlags; + auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = mem->getCreationParams().externalHandleType; @@ -168,7 +170,7 @@ CUresult CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +CUresult 
CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; From 059d1d5fa195abd08280bd032bfb3cc7d574a08d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:18:29 +0700 Subject: [PATCH 051/149] Merge with master --- include/nbl/builtin/hlsl/tgmath/impl.hlsl | 63 ++++++++++++----------- include/nbl/system/to_string.h | 10 ++++ include/nbl/video/CCUDAHandler.h | 57 ++++++-------------- 3 files changed, 59 insertions(+), 71 deletions(-) diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl index 4d1a30c757..0c1dc2f458 100644 --- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl +++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl @@ -197,12 +197,12 @@ struct erf_helper(NBL_FP64_LITERAL(0.254829592)); + const FloatingPoint a2 = _static_cast(NBL_FP64_LITERAL(-0.284496736)); + const FloatingPoint a3 = _static_cast(NBL_FP64_LITERAL(1.421413741)); + const FloatingPoint a4 = _static_cast(NBL_FP64_LITERAL(-1.453152027)); + const FloatingPoint a5 = _static_cast(NBL_FP64_LITERAL(1.061405429)); + const FloatingPoint p = _static_cast(NBL_FP64_LITERAL(0.3275911)); FloatingPoint _sign = FloatingPoint(sign(_x)); FloatingPoint x = abs(_x); @@ -393,10 +393,10 @@ struct erf_helper static float16_t __call(float16_t _x) { // A&S approximation to 2.5x10-5 - const float16_t a1 = float16_t(0.3480242f); - const float16_t a2 = float16_t(-0.0958798f); - const float16_t a3 = float16_t(0.7478556f); - const float16_t p = float16_t(0.47047f); + const float16_t a1 = _static_cast(0.3480242f); + const float16_t a2 = _static_cast(-0.0958798f); + const float16_t a3 = _static_cast(0.7478556f); + const float16_t p = _static_cast(0.47047f); float16_t _sign = float16_t(sign(_x)); float16_t x = abs_helper::__call(_x); @@ -414,35 +414,36 @@ struct erfInv_helper(_x, FloatingPoint(NBL_FP64_LITERAL(-0.99999)), FloatingPoint(NBL_FP64_LITERAL(0.99999))); + // TODO: maybe need to 
replace `FloatingPoint(NBL_FP64_LITERAL` with `_static_cast(NBL_FP64_LITERAL` to make DXC shut up + FloatingPoint x = clamp(_x, _static_cast(NBL_FP64_LITERAL(-0.99999)), _static_cast(NBL_FP64_LITERAL(0.99999))); - FloatingPoint w = -log_helper::__call((FloatingPoint(NBL_FP64_LITERAL(1.0)) - x) * (FloatingPoint(NBL_FP64_LITERAL(1.0)) + x)); + FloatingPoint w = -log_helper::__call((_static_cast(NBL_FP64_LITERAL(1.0)) - x) * (_static_cast(NBL_FP64_LITERAL(1.0)) + x)); FloatingPoint p; if (w < 5.0) { - w -= FloatingPoint(NBL_FP64_LITERAL(2.5)); - p = FloatingPoint(NBL_FP64_LITERAL(2.81022636e-08)); - p = FloatingPoint(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00021858087)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00125372503)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00417768164)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.246640727)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.50140941)) + p * w; + w -= _static_cast(NBL_FP64_LITERAL(2.5)); + p = _static_cast(NBL_FP64_LITERAL(2.81022636e-08)); + p = _static_cast(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00021858087)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00125372503)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00417768164)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.246640727)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.50140941)) + p * w; } else { w = sqrt_helper::__call(w) - FloatingPoint(NBL_FP64_LITERAL(3.0)); - p = FloatingPoint(NBL_FP64_LITERAL(-0.000200214257)); - p = FloatingPoint(NBL_FP64_LITERAL(0.000100950558)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00134934322)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00367342844)) + 
p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00573950773)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.0076224613)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00943887047)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.00167406)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(2.83297682)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.000200214257)); + p = _static_cast(NBL_FP64_LITERAL(0.000100950558)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00134934322)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00367342844)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00573950773)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.0076224613)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00943887047)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.00167406)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(2.83297682)) + p * w; } return p * x; } diff --git a/include/nbl/system/to_string.h b/include/nbl/system/to_string.h index 2a06ace5e5..1f8988566e 100644 --- a/include/nbl/system/to_string.h +++ b/include/nbl/system/to_string.h @@ -1,6 +1,7 @@ #ifndef _NBL_SYSTEM_TO_STRING_INCLUDED_ #define _NBL_SYSTEM_TO_STRING_INCLUDED_ +#include #include #include #include @@ -21,6 +22,15 @@ struct to_string_helper } }; +template +struct to_string_helper +{ + static std::string __call(const T& value) + { + return std::format("{}", value); + } +}; + template<> struct to_string_helper { diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9de55914b5..01774b25d2 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,9 +16,9 @@ namespace nbl::video { -class NBL_API2 CCUDAHandler : public core::IReferenceCounted +class CCUDAHandler : public core::IReferenceCounted { - public: + public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); inline bool defaultHandleResult(CUresult result) { @@ -34,12 +34,12 @@ class NBL_API2 CCUDAHandler : public 
core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 + ,cuCtxCreate_v2 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +62,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 + ,cuDeviceGetUuid ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion @@ -119,24 +119,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - ,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -175,25 +157,13 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::IFile::success_t bytesRead; + system::future bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); + source.resize(bytesRead.get()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } - struct SCUDADeviceInfo - { - CUdevice handle = {}; - CUuuid uuid = {}; - int 
attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - }; - - inline core::vector const& getAvailableDevices() const - { - return m_availableDevices; - } - // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -258,8 +228,16 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) + : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) + { + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + } ~CCUDAHandler() = default; // @@ -282,7 +260,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // - core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; From 2eb8fee018a1877cca265efaf929ea78bbeee440 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:37:55 +0700 Subject: [PATCH 052/149] Add option for _NBL_COMPILE_WITH_CUDA_ --- src/nbl/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 692efec8bd..4c2f0571dd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -68,6 +68,7 @@ option(_NBL_COMPILE_WITH_GLI_LOADER_ "Compile with GLI Loader" ON) option(_NBL_COMPILE_WITH_GLI_WRITER_ "Compile with GLI Writer" ON) option(_NBL_COMPILE_WITH_GLTF_LOADER_ "Compile with GLTF Loader" OFF) # TMP OFF COMPILE ERRORS 
ON V143 ON MASTER option(_NBL_COMPILE_WITH_GLTF_WRITER_ "Compile with GLTF Writer" OFF) # TMP OFF COMPILE ERRORS ON V143 ON MASTER +option(_NBL_COMPILE_WITH_CUDA_ "Compile with CUDA" ON) set(_NBL_EG_PRFNT_LEVEL 0 CACHE STRING "EasterEgg Profanity Level") option(NBL_EXPLICIT_MODULE_LOAD_LOG "Enable Runtime logs for external dynamic module loading" OFF) @@ -95,9 +96,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) +if (_NBL_COMPILE_WITH_CUDA_) message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() @@ -425,7 +425,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_COMPILE_WITH_CUDA) +if(_NBL_COMPILE_WITH_CUDA_) target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) endif() @@ -665,7 +665,7 @@ target_link_libraries(Nabla PRIVATE volk) target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) # CUDA -if (NBL_COMPILE_WITH_CUDA) +if (_NBL_COMPILE_WITH_CUDA_) list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") endif() From 6605bebf70742dbd64530b8cfeb93e911c5850fc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 13:42:49 +0700 Subject: [PATCH 053/149] Revert to correct state before merging with master --- include/nbl/builtin/hlsl/tgmath/impl.hlsl | 63 +++++++++++------------ include/nbl/video/CCUDAHandler.h | 57 ++++++++++++++------ 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl index 0c1dc2f458..4d1a30c757 100644 --- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl +++ 
b/include/nbl/builtin/hlsl/tgmath/impl.hlsl @@ -197,12 +197,12 @@ struct erf_helper(NBL_FP64_LITERAL(0.254829592)); - const FloatingPoint a2 = _static_cast(NBL_FP64_LITERAL(-0.284496736)); - const FloatingPoint a3 = _static_cast(NBL_FP64_LITERAL(1.421413741)); - const FloatingPoint a4 = _static_cast(NBL_FP64_LITERAL(-1.453152027)); - const FloatingPoint a5 = _static_cast(NBL_FP64_LITERAL(1.061405429)); - const FloatingPoint p = _static_cast(NBL_FP64_LITERAL(0.3275911)); + const FloatingPoint a1 = FloatingPoint(NBL_FP64_LITERAL(0.254829592)); + const FloatingPoint a2 = FloatingPoint(NBL_FP64_LITERAL(-0.284496736)); + const FloatingPoint a3 = FloatingPoint(NBL_FP64_LITERAL(1.421413741)); + const FloatingPoint a4 = FloatingPoint(NBL_FP64_LITERAL(-1.453152027)); + const FloatingPoint a5 = FloatingPoint(NBL_FP64_LITERAL(1.061405429)); + const FloatingPoint p = FloatingPoint(NBL_FP64_LITERAL(0.3275911)); FloatingPoint _sign = FloatingPoint(sign(_x)); FloatingPoint x = abs(_x); @@ -393,10 +393,10 @@ struct erf_helper static float16_t __call(float16_t _x) { // A&S approximation to 2.5x10-5 - const float16_t a1 = _static_cast(0.3480242f); - const float16_t a2 = _static_cast(-0.0958798f); - const float16_t a3 = _static_cast(0.7478556f); - const float16_t p = _static_cast(0.47047f); + const float16_t a1 = float16_t(0.3480242f); + const float16_t a2 = float16_t(-0.0958798f); + const float16_t a3 = float16_t(0.7478556f); + const float16_t p = float16_t(0.47047f); float16_t _sign = float16_t(sign(_x)); float16_t x = abs_helper::__call(_x); @@ -414,36 +414,35 @@ struct erfInv_helper(NBL_FP64_LITERAL` to make DXC shut up - FloatingPoint x = clamp(_x, _static_cast(NBL_FP64_LITERAL(-0.99999)), _static_cast(NBL_FP64_LITERAL(0.99999))); + FloatingPoint x = clamp(_x, FloatingPoint(NBL_FP64_LITERAL(-0.99999)), FloatingPoint(NBL_FP64_LITERAL(0.99999))); - FloatingPoint w = -log_helper::__call((_static_cast(NBL_FP64_LITERAL(1.0)) - x) * (_static_cast(NBL_FP64_LITERAL(1.0)) + x)); + 
FloatingPoint w = -log_helper::__call((FloatingPoint(NBL_FP64_LITERAL(1.0)) - x) * (FloatingPoint(NBL_FP64_LITERAL(1.0)) + x)); FloatingPoint p; if (w < 5.0) { - w -= _static_cast(NBL_FP64_LITERAL(2.5)); - p = _static_cast(NBL_FP64_LITERAL(2.81022636e-08)); - p = _static_cast(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00021858087)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.00125372503)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.00417768164)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.246640727)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(1.50140941)) + p * w; + w -= FloatingPoint(NBL_FP64_LITERAL(2.5)); + p = FloatingPoint(NBL_FP64_LITERAL(2.81022636e-08)); + p = FloatingPoint(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00021858087)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.00125372503)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.00417768164)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.246640727)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(1.50140941)) + p * w; } else { w = sqrt_helper::__call(w) - FloatingPoint(NBL_FP64_LITERAL(3.0)); - p = _static_cast(NBL_FP64_LITERAL(-0.000200214257)); - p = _static_cast(NBL_FP64_LITERAL(0.000100950558)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00134934322)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.00367342844)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00573950773)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.0076224613)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00943887047)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(1.00167406)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(2.83297682)) + p * w; + p = 
FloatingPoint(NBL_FP64_LITERAL(-0.000200214257)); + p = FloatingPoint(NBL_FP64_LITERAL(0.000100950558)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00134934322)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.00367342844)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00573950773)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.0076224613)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00943887047)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(1.00167406)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(2.83297682)) + p * w; } return p * x; } diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..9de55914b5 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,9 +16,9 @@ namespace nbl::video { -class CCUDAHandler : public core::IReferenceCounted +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { - public: + public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); inline bool defaultHandleResult(CUresult result) { @@ -34,12 +34,12 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v2 + ,cuCtxCreate_v4 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +62,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid + ,cuDeviceGetUuid_v2 ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion @@ -119,6 +119,24 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate 
,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -157,13 +175,25 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -228,16 +258,8 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - 
m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + ~CCUDAHandler() = default; // @@ -260,6 +282,7 @@ class CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // + core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; From af35f4f24df3ce4a670c9addebccb3429fa2ff8c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 13:43:59 +0700 Subject: [PATCH 054/149] Revert "Add option for _NBL_COMPILE_WITH_CUDA_" This reverts commit 2eb8fee018a1877cca265efaf929ea78bbeee440. --- src/nbl/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 4c2f0571dd..692efec8bd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -68,7 +68,6 @@ option(_NBL_COMPILE_WITH_GLI_LOADER_ "Compile with GLI Loader" ON) option(_NBL_COMPILE_WITH_GLI_WRITER_ "Compile with GLI Writer" ON) option(_NBL_COMPILE_WITH_GLTF_LOADER_ "Compile with GLTF Loader" OFF) # TMP OFF COMPILE ERRORS ON V143 ON MASTER option(_NBL_COMPILE_WITH_GLTF_WRITER_ "Compile with GLTF Writer" OFF) # TMP OFF COMPILE ERRORS ON V143 ON MASTER -option(_NBL_COMPILE_WITH_CUDA_ "Compile with CUDA" ON) set(_NBL_EG_PRFNT_LEVEL 0 CACHE STRING "EasterEgg Profanity Level") option(NBL_EXPLICIT_MODULE_LOAD_LOG "Enable Runtime logs for external dynamic module loading" OFF) @@ -96,8 +95,9 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (_NBL_COMPILE_WITH_CUDA_) +if (NBL_COMPILE_WITH_CUDA) message(STATUS "Building with CUDA interop") + set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ 
${NBL_BUILD_OPTIX}) endif() @@ -425,7 +425,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(_NBL_COMPILE_WITH_CUDA_) +if(NBL_COMPILE_WITH_CUDA) target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) endif() @@ -665,7 +665,7 @@ target_link_libraries(Nabla PRIVATE volk) target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) # CUDA -if (_NBL_COMPILE_WITH_CUDA_) +if (NBL_COMPILE_WITH_CUDA) list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") endif() From 2479fb2e64bc9704779e10d496d0a71ef7bb7846 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 17:55:44 +0700 Subject: [PATCH 055/149] Slight fix --- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 39c0efae19..955885b7ae 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked Date: Thu, 23 Apr 2026 17:59:08 +0700 Subject: [PATCH 056/149] Slight fix on linux handle --- src/nbl/video/CVulkanLogicalDevice.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index c22cfe93b9..74e8be47bf 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -204,7 +204,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkImportMemoryFdInfoKHR importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, .handleType = static_cast(info.externalHandleType), - .fd = (int)info.externalHandle, + .fd = info.externalHandle, }; #endif @@ -224,7 +224,11 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const 
SAlloca if (info.externalHandle) //importing { externalHandle = DuplicateExternalHandle(info.externalHandle); +#ifdef _WIN32 importInfo.handle = externalHandle; +#else + importInfo.fd = externalHandle; +#endif *pNext = &importInfo; } else // exporting From 3df125b804a5147cdf78229d2ded029a64825d5d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:00:59 +0700 Subject: [PATCH 057/149] Fix typo --- include/nbl/asset/IBuffer.h | 2 +- include/nbl/video/IPhysicalDevice.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index 92ffd3eb4d..99f85e0b72 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -43,7 +43,7 @@ class IBuffer : public IDescriptor, public core::IBuffer // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, - EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ + EUF_SYNTHETIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ }; //! diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index f8550debce..e3cfe15a90 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -661,7 +661,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const { - usages &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + usages &= ~asset::IBuffer::EUF_SYNTHETIC_FLAGS_MASK; // mask out synthetic flags // TODO(kevinyu): Should we cached the properties like Atil does. If yes, needs mutex and mutable specifier. Class become not that simple anymore. 
// { From 2e2ca3f2a148aae96cb23aa1fc1bbe789753e5f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:09:05 +0700 Subject: [PATCH 058/149] Fix CCUDAImportedSemaphore constructor --- include/nbl/video/CCUDAImportedSemaphore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index d5139a55c9..4d014b9e39 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -32,7 +32,7 @@ class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted core::smart_refctd_ptr src, CUexternalSemaphore semaphore) : m_device(std::move(device)) - , m_src(std::move(m_src)) + , m_src(std::move(src)) , m_handle(semaphore) {} ~CCUDAImportedSemaphore() override; From 8c4c91e273a9146ae6c4c3e9843ef0d37c4b4b75 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:11:56 +0700 Subject: [PATCH 059/149] Remove unused CCUDASharedSemaphore.cpp --- src/nbl/video/CCUDASharedSemaphore.cpp | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 src/nbl/video/CCUDASharedSemaphore.cpp diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp deleted file mode 100644 index ae2291035a..0000000000 --- a/src/nbl/video/CCUDASharedSemaphore.cpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#include "nbl/video/CCUDASharedSemaphore.h" -#include "nbl/video/CCUDADevice.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ -namespace nbl::video -{ -CCUDASharedSemaphore::~CCUDASharedSemaphore() -{ - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) - assert(!"Invalid code path."); -} -} - -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file From fcec2684a22576b159e422a8fd7f2847e2a530b6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:20:11 +0700 Subject: [PATCH 060/149] Fix handle type for Linux --- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 74e8be47bf..5cc0dbd8f3 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -279,7 +279,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca #ifdef _WIN32 VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, #else - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, #endif .memory = vk_deviceMemory, .handleType = static_cast(info.externalHandleType), From ac1878160267085b3d6b1d310999d99322ba0c91 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 19:05:06 +0700 Subject: [PATCH 061/149] Add missing external handle type and make the constant consistent --- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/IDeviceMemoryAllocation.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 89449a21f0..d30e7b18c5 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -34,7 +34,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static constexpr 
IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else - static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 52b541ceb5..e75acf2fd0 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -74,6 +74,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted enum E_EXTERNAL_HANDLE_TYPE : uint32_t { EHT_NONE = 0, + EHT_OPAQUE_FD = 0x00000001, EHT_OPAQUE_WIN32 = 0x00000002, EHT_OPAQUE_WIN32_KMT = 0x00000004, EHT_D3D11_TEXTURE = 0x00000008, From d73c851440cc2aabed9b6461bf51e22d1795eff2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 20:06:19 +0700 Subject: [PATCH 062/149] Slight fix --- include/nbl/video/CCUDADevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index d30e7b18c5..7f51443972 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -35,7 +35,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; - static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + static constexpr 
CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif struct SCUDACleaner : video::ICleanup From 2c75ed882e1d1a0da1feadb8336a2fdd4f76909d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:32:43 +0700 Subject: [PATCH 063/149] Fix indentation and refactor to be more idiomatic --- include/nbl/video/CCUDADevice.h | 4 +- include/nbl/video/CCUDAExportableMemory.h | 55 ++++++++++++---------- include/nbl/video/CCUDAImportedMemory.h | 28 +++++------ include/nbl/video/CCUDAImportedSemaphore.h | 37 +++++++-------- src/nbl/video/CCUDADevice.cpp | 10 ++-- 5 files changed, 67 insertions(+), 67 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7f51443972..ce9d0ea3b2 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -92,9 +92,9 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(CUmemLocationType location, size_t size) const; CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); - CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); + CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem); - CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index b4df99d9f5..d96a5ad62b 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -10,7 +10,7 @@ #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 
SDK or higher." + #error "Need CUDA 9.0 SDK or higher." #endif // useful includes in the future @@ -24,41 +24,44 @@ class CCUDAMemoryMapping: public core::IReferenceCounted { }; +class CCUDADevice; + class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { -public: - friend class CCUDADevice; + public: - CUdeviceptr getDeviceptr() const { return m_params.ptr; } + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; - struct SCreationParams - { - size_t size; - uint32_t alignment; - CUmemLocationType location; - }; + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + external_handle_t externalHandle; + }; - struct SCachedCreationParams : SCreationParams - { - size_t granularSize; - CUdeviceptr ptr; - external_handle_t externalHandle; - }; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, CUmemGenericAllocationHandle allocationHandle) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_allocationHandle(allocationHandle) + {} + ~CCUDAExportableMemory() override; - const SCreationParams& getCreationParams() const { return m_params; } + CUdeviceptr getDeviceptr() const { return m_params.ptr; } - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + const SCreationParams& getCreationParams() const { return m_params; } -protected: + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) - : m_device(std::move(device)) - , m_params(std::move(params)) - {} - ~CCUDAExportableMemory() override; + private: - core::smart_refctd_ptr m_device; - SCachedCreationParams m_params; + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + CUmemGenericAllocationHandle m_allocationHandle; }; } diff --git 
a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 8fbbccb31b..4e3bfcd085 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -16,24 +16,24 @@ namespace nbl::video class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { - public: - friend class CCUDADevice; + public: - CUexternalMemory getInternalObject() const { return m_handle; } - CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, + CUexternalMemory cuExtMem) : + m_device(device), + m_src(src), + m_handle(cuExtMem) {} - protected: - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalMemory cuExtMem) : - m_device(device), - m_src(src), - m_handle(cuExtMem) {} + ~CCUDAImportedMemory() override; - ~CCUDAImportedMemory() override; + CUexternalMemory getInternalObject() const { return m_handle; } + CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalMemory m_handle; + private: + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalMemory m_handle; }; diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 4d014b9e39..2e5010fa2d 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -9,7 +9,7 @@ #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." + #error "Need CUDA 9.0 SDK or higher." 
#endif // useful includes in the future @@ -21,25 +21,22 @@ namespace nbl::video class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { -public: - friend class CCUDADevice; - - CUexternalSemaphore getInternalObject() const { return m_handle; } - -protected: - - CCUDAImportedSemaphore(core::smart_refctd_ptr device, - core::smart_refctd_ptr src, - CUexternalSemaphore semaphore) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_handle(semaphore) - {} - ~CCUDAImportedSemaphore() override; - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalSemaphore m_handle; + public: + + CUexternalSemaphore getInternalObject() const { return m_handle; } + CCUDAImportedSemaphore(core::smart_refctd_ptr device, + core::smart_refctd_ptr src, + CUexternalSemaphore semaphore) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_handle(semaphore) + {} + ~CCUDAImportedSemaphore() override; + + private: + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; }; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 3f933be988..423491df6d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -134,12 +134,12 @@ CUresult CCUDADevice::createExportableMemory( return err; } - *outMem = core::smart_refctd_ptr(new CCUDAExportableMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); return CUDA_SUCCESS; } -CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) +CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem) { if (!mem || !outPtr) return CUDA_ERROR_INVALID_VALUE; @@ -166,11 +166,11 @@ CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr(new CCUDAImportedMemory(core::smart_refctd_ptr(this), 
core::smart_refctd_ptr(mem), cuExtMem), core::dont_grab); + *outPtr = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); return CUDA_SUCCESS; } -CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; @@ -197,7 +197,7 @@ CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr(new CCUDAImportedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); + *outPtr = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); return CUDA_SUCCESS; } From 3e905e9ce2085954ec551f93707b9ff3c8f978d6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:34:03 +0700 Subject: [PATCH 064/149] Add some comment --- include/nbl/video/EApiType.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 0726049200..7f99d40309 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,7 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; +// TODO(kevinyu): Should I move this type and functions to its own file? 
using external_handle_t = #ifdef _WIN32 void* From 963a3d66732f90a77866100bd3a93c561d721cf2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:34:48 +0700 Subject: [PATCH 065/149] Fix typo --- include/nbl/video/CCUDADevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index ce9d0ea3b2..57e1e6bd53 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -97,7 +97,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); protected: - CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 423491df6d..b7aae1e3d9 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -57,7 +57,7 @@ size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; } -CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { auto& cu = 
m_handler->getCUDAFunctionTable(); @@ -121,7 +121,7 @@ CUresult CCUDADevice::createExportableMemory( return err; } - if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { CloseExternalHandle(params.externalHandle); cu.pcuMemRelease(mem); From 0de37b0646dc8317fd1577480ce9963406667e87 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:35:29 +0700 Subject: [PATCH 066/149] Slight improvement --- include/nbl/video/CCUDADevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 57e1e6bd53..bf0cb7d899 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -97,7 +97,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); protected: - CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index b7aae1e3d9..1e6d020161 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -57,7 +57,7 @@ size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) return ((size - 1) / 
m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; } -CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const { auto& cu = m_handler->getCUDAFunctionTable(); From d50d709a4c06370cb0281eb735ace856240d72bf Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:36:30 +0700 Subject: [PATCH 067/149] Remove unused variable --- src/nbl/video/CCUDADevice.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 1e6d020161..c523a32a98 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -144,8 +144,6 @@ CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptrgetCreationParams().memoryPropertyFlags; - auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = mem->getCreationParams().externalHandleType; From 763d173a4303eaa7794998638f01097108fb61c2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:56:31 +0700 Subject: [PATCH 068/149] Add include WIN32 include guard --- include/nbl/video/CCUDADevice.h | 4 +++- src/nbl/video/CCUDADevice.cpp | 38 +++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index bf0cb7d899..766f06c82c 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -96,9 +96,11 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); - protected: + private: CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, 
CUmemGenericAllocationHandle memory) const; + CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; + friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); ~CCUDADevice(); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index c523a32a98..11e764f2f5 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,7 +3,9 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDADevice.h" +#ifdef _WIN32 #include +#endif #include "nbl/video/CCUDAImportedMemory.h" @@ -39,13 +41,7 @@ CCUDADevice::CCUDADevice( for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) { - uint32_t metaData[16] = { 48 }; - CUmemAllocationProp prop = { - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = {.type = static_cast(i), .id = m_handle }, - .win32HandleMetaData = metaData, - }; + const auto prop = getMemAllocationProp(static_cast(i)); auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); assert(CUDA_SUCCESS == re); @@ -88,6 +84,24 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } +CUmemAllocationProp CCUDADevice::getMemAllocationProp(CUmemLocationType locationType) const +{ + +#ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = {}; + metadata.Length = sizeof(OBJECT_ATTRIBUTES); +#endif + + return { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = locationType, .id = m_handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; +} + CUresult CCUDADevice::createExportableMemory( core::smart_refctd_ptr* outMem, CCUDAExportableMemory::SCreationParams&& 
inParams) @@ -99,15 +113,7 @@ CUresult CCUDADevice::createExportableMemory( auto& cu = m_handler->getCUDAFunctionTable(); - OBJECT_ATTRIBUTES metadata = {}; - metadata.Length = sizeof(OBJECT_ATTRIBUTES); - - CUmemAllocationProp prop = { - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = params.location, .id = m_handle }, - .win32HandleMetaData = &metadata, - }; + const auto prop = getMemAllocationProp(params.location); params.granularSize = roundToGranularity(params.location, params.size); From d71e52d6e94b141913f04eaca25f7127de5e72a2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:58:01 +0700 Subject: [PATCH 069/149] Remove unused class --- include/nbl/video/CCUDAExportableMemory.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index d96a5ad62b..aa197a2e4c 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -20,10 +20,6 @@ namespace nbl::video { -class CCUDAMemoryMapping: public core::IReferenceCounted -{ -}; - class CCUDADevice; class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted From cfad81644ea99f4786303c2fe75054cfedc07328 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:08:45 +0700 Subject: [PATCH 070/149] Refactor CCUDADevice api to be more consistent with vulkan device api --- include/nbl/video/CCUDADevice.h | 10 +++++--- src/nbl/video/CCUDADevice.cpp | 42 +++++++++++---------------------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 766f06c82c..8654e81571 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -87,14 +87,18 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted } CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { 
return m_handler.get(); } + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; - CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); - CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem); + core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); + + core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); - CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); + core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); private: CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 11e764f2f5..bedc582b9f 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -102,13 +102,8 @@ CUmemAllocationProp CCUDADevice::getMemAllocationProp(CUmemLocationType location }; } -CUresult CCUDADevice::createExportableMemory( - core::smart_refctd_ptr* outMem, - CCUDAExportableMemory::SCreationParams&& inParams) +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) { - if (!outMem) - return CUDA_ERROR_INVALID_VALUE; - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; auto& cu = m_handler->getCUDAFunctionTable(); @@ -119,41 +114,37 @@ CUresult CCUDADevice::createExportableMemory( CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) - return err; + return nullptr; if (auto err = 
cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { cu.pcuMemRelease(mem); - return err; + return nullptr; } if (auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { CloseExternalHandle(params.externalHandle); cu.pcuMemRelease(mem); - return err; + return nullptr; } if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { CloseExternalHandle(params.externalHandle); - return err; + return nullptr; } - *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); - - return CUDA_SUCCESS; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); } -CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem) +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - if (!mem || !outPtr) - return CUDA_ERROR_INVALID_VALUE; auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = mem->getCreationParams().externalHandleType; - if (!handleType) return CUDA_ERROR_INVALID_VALUE; + if (!handleType) return nullptr; const auto externalHandle = mem->getExternalHandle(); @@ -169,21 +160,17 @@ CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); - return CUDA_SUCCESS; + return nullptr; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); } -CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sema) +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - if (!sema || !outPtr) - return CUDA_ERROR_INVALID_VALUE; - auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) - return CUDA_ERROR_INVALID_VALUE; + return nullptr; 
CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 @@ -199,10 +186,9 @@ CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); - return CUDA_SUCCESS; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); } CCUDADevice::~CCUDADevice() From b22168e6f6238b415a1d8902d1f68bd7967149fb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:12:25 +0700 Subject: [PATCH 071/149] Refactor constructor parameter naming --- include/nbl/video/CCUDADevice.h | 8 ++++---- src/nbl/video/CCUDADevice.cpp | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 8654e81571..b3be2a9014 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -81,6 +81,10 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); + + ~CCUDADevice(); + inline core::SRange geDefaultCompileOptions() const { return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; @@ -104,10 +108,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; - - friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); - ~CCUDADevice(); 
std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index bedc582b9f..25c93222da 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -14,17 +14,17 @@ namespace nbl::video { CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& _vulkanConnection, - IPhysicalDevice* const _vulkanDevice, - const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, - CUdevice _device, - core::smart_refctd_ptr&& _handler) : + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + CUdevice device, + core::smart_refctd_ptr&& handler) : m_defaultCompileOptions(), - m_vulkanConnection(std::move(_vulkanConnection)), - m_vulkanDevice(_vulkanDevice), - m_virtualArchitecture(_virtualArchitecture), - m_handle(_device), - m_handler(std::move(_handler)), + m_vulkanConnection(std::move(vulkanConnection)), + m_vulkanDevice(vulkanDevice), + m_virtualArchitecture(virtualArchitecture), + m_handle(device), + m_handler(std::move(handler)), m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); From 5bd64ae2e9540d539d15b2aeeb3dee7afc7446ee Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:14:07 +0700 Subject: [PATCH 072/149] Idiomatic way to create core::smart_refctd_ptr --- src/nbl/video/CCUDAHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 770db41946..cee01b976b 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -664,7 +664,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); + return 
core::make_smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); } } return nullptr; From bd0f8a270f062c20aa0099d04a0a1fa8e870ce24 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 01:10:39 +0700 Subject: [PATCH 073/149] Fix destruction and remove unnecessary SCUDACleaner --- include/nbl/video/CCUDADevice.h | 7 ------- include/nbl/video/IDeviceMemoryAllocation.h | 5 ----- include/nbl/video/IDeviceMemoryAllocator.h | 6 +----- src/nbl/video/CCUDAExportableMemory.cpp | 10 +++++----- src/nbl/video/CCUDAHandler.cpp | 2 +- 5 files changed, 7 insertions(+), 23 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index b3be2a9014..ffa006b4d9 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -38,13 +38,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif - struct SCUDACleaner : video::ICleanup - { - core::smart_refctd_ptr resource; - SCUDACleaner(core::smart_refctd_ptr resource) - : resource(std::move(resource)) - {} - }; enum E_VIRTUAL_ARCHITECTURE { EVA_30, diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index e75acf2fd0..6120574baa 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -191,10 +191,6 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted virtual external_handle_t getExternalHandle() const = 0; protected: - inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) - { - m_postDestroyCleanup = std::move(cleanup); - } IDeviceMemoryAllocation( const ILogicalDevice* originDevice, SCreationParams&& params = {}) @@ -213,7 +209,6 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted uint8_t* 
m_mappedPtr = nullptr; MemoryRange m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 8fc07dd698..797536113c 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -125,18 +125,14 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked* dedication = nullptr, const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, - external_handle_t externalHandle = {}, - std::unique_ptr&& postDestroyCleanup = nullptr) + external_handle_t externalHandle = {}) { for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); auto allocation = allocate(allocateInfo); if (allocation.isValid()) - { - allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; - } } return {}; } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index bbe773f610..e778a46875 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -35,17 +35,17 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM dedication, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - m_params.externalHandle, - std::make_unique(core::smart_refctd_ptr(this))).memory; + m_params.externalHandle).memory; } CCUDAExportableMemory::~CCUDAExportableMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - CUresult re[] = { - 
cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), - }; + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize); + cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize); + cu.pcuMemRelease(m_allocationHandle); + CloseExternalHandle(m_params.externalHandle); } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index cee01b976b..19528d4816 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -664,7 +664,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); + return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); } } return nullptr; From 6d47b9000d4ea647973c65ce75a00df45f33009e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 23:58:05 +0700 Subject: [PATCH 074/149] CCUDAHandler construction more idiomatic --- src/nbl/video/CCUDAHandler.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 19528d4816..060afe6631 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -526,9 +526,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - - CCUDAHandler* handler = new CCUDAHandler(std::move(cuda), std::move(nvrtc),std::move(headers), std::move(_logger), cudaVersion); - return core::smart_refctd_ptr(handler,core::dont_grab); + return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); } nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) From 
0257d9afbff4779532a1fa0042d565ffc29ad0fc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 00:31:06 +0700 Subject: [PATCH 075/149] Refactor magic number --- include/nbl/video/CCUDADevice.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index ffa006b4d9..f19a7fdae6 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -87,7 +87,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const CCUDAHandler* getHandler() const { return m_handler.get(); } - bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } size_t roundToGranularity(CUmemLocationType location, size_t size) const; @@ -102,15 +102,18 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; + static constexpr auto CudaMemoryLocationCount = 5; + + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_vulkanDevice; + IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; CUdevice m_handle; CUcontext m_context; - size_t m_allocationGranularity[4]; + std::array m_allocationGranularity; }; } From 099999478b1574930dfafac2e22e936931452781 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 00:32:03 +0700 Subject: [PATCH 076/149] Remove releasing allocationHandle in destructor, since we already call it after cuMemMap --- include/nbl/video/CCUDAExportableMemory.h | 4 +--- src/nbl/video/CCUDAExportableMemory.cpp | 1 - 2 files changed, 1 
insertion(+), 4 deletions(-) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index aa197a2e4c..1c3d206906 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -40,10 +40,9 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted external_handle_t externalHandle; }; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, CUmemGenericAllocationHandle allocationHandle) + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) : m_device(std::move(device)) , m_params(std::move(params)) - , m_allocationHandle(allocationHandle) {} ~CCUDAExportableMemory() override; @@ -57,7 +56,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted core::smart_refctd_ptr m_device; SCachedCreationParams m_params; - CUmemGenericAllocationHandle m_allocationHandle; }; } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index e778a46875..e6c6b67509 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -44,7 +44,6 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cu.pcuMemUnmap(m_params.ptr, m_params.granularSize); cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize); - cu.pcuMemRelease(m_allocationHandle); CloseExternalHandle(m_params.externalHandle); From 6f4b889cbb9cbb1586b99ecb4e62adfdac6be965 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 00:32:38 +0700 Subject: [PATCH 077/149] Input validation and error logging --- include/nbl/video/CCUDAHandler.h | 15 ++++- src/nbl/video/CCUDADevice.cpp | 72 ++++++++++++++---------- src/nbl/video/CCUDAExportableMemory.cpp | 10 ++-- src/nbl/video/CCUDAImportedMemory.cpp | 3 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 3 +- 5 files changed, 63 insertions(+), 40 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h 
b/include/nbl/video/CCUDAHandler.h index 9de55914b5..602637f202 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,11 +16,13 @@ namespace nbl::video { + class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); - inline bool defaultHandleResult(CUresult result) + static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); + + inline bool defaultHandleResult(CUresult result) const { core::smart_refctd_ptr logger = m_logger.get(); return defaultHandleResult(result,logger.get()); @@ -137,6 +139,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuImportExternalSemaphore ,cuSignalExternalSemaphoresAsync ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -291,6 +294,14 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted int m_version; }; +#define ASSERT_CUDA_SUCCESS(expr, handler) \ + do { \ + const auto cudaResult = (expr); \ + if (!((handler)->defaultHandleResult(cudaResult))) { \ + assert(false); \ + } \ + } while(0) + } #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 25c93222da..83cbb2573a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -19,9 +19,10 @@ CCUDADevice::CCUDADevice( const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler) : + m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), - m_vulkanDevice(vulkanDevice), + m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), m_handle(device), m_handler(std::move(handler)), @@ -32,19 +33,15 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); 
m_defaultCompileOptions.push_back("-use_fast_math"); - auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = m_handler->getCUDAFunctionTable(); - CUresult re = cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle); - assert(CUDA_SUCCESS == re); - re = cu.pcuCtxSetCurrent(m_context); - assert(CUDA_SUCCESS == re); + ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); - for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - const auto prop = getMemAllocationProp(static_cast(i)); - auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); - - assert(CUDA_SUCCESS == re); + const auto prop = getMemAllocationProp(static_cast(locationType)); + ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); } } @@ -55,15 +52,15 @@ size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const { - auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = m_handler->getCUDAFunctionTable(); CUdeviceptr ptr = 0; - if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) + if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - cu.pcuMemAddressFree(ptr, size); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); return err; } @@ -74,8 +71,8 @@ CUresult 
CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - cu.pcuMemUnmap(ptr, size); - cu.pcuMemAddressFree(ptr, size); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); return err; } @@ -114,35 +111,44 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) + { + m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; + } if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - cu.pcuMemRelease(mem); + m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); return nullptr; } - if (auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseExternalHandle(params.externalHandle); - cu.pcuMemRelease(mem); + m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + + bool closeSucceed = CloseExternalHandle(params.externalHandle); + assert(closeSucceed); + return nullptr; } - if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CloseExternalHandle(params.externalHandle); + bool closeSucceed = CloseExternalHandle(params.externalHandle); + assert(closeSucceed); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); + return 
core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - - auto& cu = m_handler->getCUDAFunctionTable(); - auto handleType = mem->getCreationParams().externalHandleType; + const auto& cu = m_handler->getCUDAFunctionTable(); + const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -159,8 +165,11 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co extMemDesc.size = mem->getAllocationSize(); CUexternalMemory cuExtMem; - if (auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + if (const auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; + } return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); } @@ -185,15 +194,18 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CUexternalSemaphore cusema; - if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + if (const auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import semaphore into CUDA!"); return nullptr; + } return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); } CCUDADevice::~CCUDADevice() { - m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); + ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); } } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index e6c6b67509..ae64cf777f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -40,12 +40,14 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - auto& cu = 
m_device->getHandler()->getCUDAFunctionTable(); + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cu.pcuMemUnmap(m_params.ptr, m_params.granularSize); - cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler()); - CloseExternalHandle(m_params.externalHandle); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + + bool closeSucceed = CloseExternalHandle(m_params.externalHandle); + assert(closeSucceed); } } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 33ba43eb28..7e21b05ef1 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -24,8 +24,7 @@ CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) CCUDAImportedMemory::~CCUDAImportedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (cu.pcuDestroyExternalMemory(m_handle) != CUDA_SUCCESS) - assert(!"Invalid code path"); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 69b851088e..0dc750a4a9 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -11,8 +11,7 @@ namespace nbl::video CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) - assert(!"Invalid code path."); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); } } From 129ceaca26a7c5eeebdb64d6fb355fbbb2113dc6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 13:06:33 +0700 Subject: [PATCH 078/149] Revert 6605bebf changes in tgmath impl.hlsl --- include/nbl/builtin/hlsl/tgmath/impl.hlsl | 63 
++++++++++++----------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl index 4d1a30c757..0c1dc2f458 100644 --- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl +++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl @@ -197,12 +197,12 @@ struct erf_helper(NBL_FP64_LITERAL(0.254829592)); + const FloatingPoint a2 = _static_cast(NBL_FP64_LITERAL(-0.284496736)); + const FloatingPoint a3 = _static_cast(NBL_FP64_LITERAL(1.421413741)); + const FloatingPoint a4 = _static_cast(NBL_FP64_LITERAL(-1.453152027)); + const FloatingPoint a5 = _static_cast(NBL_FP64_LITERAL(1.061405429)); + const FloatingPoint p = _static_cast(NBL_FP64_LITERAL(0.3275911)); FloatingPoint _sign = FloatingPoint(sign(_x)); FloatingPoint x = abs(_x); @@ -393,10 +393,10 @@ struct erf_helper static float16_t __call(float16_t _x) { // A&S approximation to 2.5x10-5 - const float16_t a1 = float16_t(0.3480242f); - const float16_t a2 = float16_t(-0.0958798f); - const float16_t a3 = float16_t(0.7478556f); - const float16_t p = float16_t(0.47047f); + const float16_t a1 = _static_cast(0.3480242f); + const float16_t a2 = _static_cast(-0.0958798f); + const float16_t a3 = _static_cast(0.7478556f); + const float16_t p = _static_cast(0.47047f); float16_t _sign = float16_t(sign(_x)); float16_t x = abs_helper::__call(_x); @@ -414,35 +414,36 @@ struct erfInv_helper(_x, FloatingPoint(NBL_FP64_LITERAL(-0.99999)), FloatingPoint(NBL_FP64_LITERAL(0.99999))); + // TODO: maybe need to replace `FloatingPoint(NBL_FP64_LITERAL` with `_static_cast(NBL_FP64_LITERAL` to make DXC shut up + FloatingPoint x = clamp(_x, _static_cast(NBL_FP64_LITERAL(-0.99999)), _static_cast(NBL_FP64_LITERAL(0.99999))); - FloatingPoint w = -log_helper::__call((FloatingPoint(NBL_FP64_LITERAL(1.0)) - x) * (FloatingPoint(NBL_FP64_LITERAL(1.0)) + x)); + FloatingPoint w = -log_helper::__call((_static_cast(NBL_FP64_LITERAL(1.0)) - x) * (_static_cast(NBL_FP64_LITERAL(1.0)) + 
x)); FloatingPoint p; if (w < 5.0) { - w -= FloatingPoint(NBL_FP64_LITERAL(2.5)); - p = FloatingPoint(NBL_FP64_LITERAL(2.81022636e-08)); - p = FloatingPoint(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00021858087)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00125372503)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00417768164)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.246640727)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.50140941)) + p * w; + w -= _static_cast(NBL_FP64_LITERAL(2.5)); + p = _static_cast(NBL_FP64_LITERAL(2.81022636e-08)); + p = _static_cast(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00021858087)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00125372503)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00417768164)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.246640727)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.50140941)) + p * w; } else { w = sqrt_helper::__call(w) - FloatingPoint(NBL_FP64_LITERAL(3.0)); - p = FloatingPoint(NBL_FP64_LITERAL(-0.000200214257)); - p = FloatingPoint(NBL_FP64_LITERAL(0.000100950558)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00134934322)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00367342844)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00573950773)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.0076224613)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00943887047)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.00167406)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(2.83297682)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.000200214257)); + p = _static_cast(NBL_FP64_LITERAL(0.000100950558)) + p * w; + p = 
_static_cast(NBL_FP64_LITERAL(0.00134934322)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00367342844)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00573950773)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.0076224613)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00943887047)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.00167406)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(2.83297682)) + p * w; } return p * x; } From 2c084646347d3d9ee446572ebf039acbe89b33c6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 13:20:14 +0700 Subject: [PATCH 079/149] Fix indentation in IDeviceMemoryAllocator.h --- include/nbl/video/IDeviceMemoryAllocator.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 797536113c..019fbd9358 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -47,11 +47,11 @@ class NBL_API2 IDeviceMemoryAllocator core::bitflag allocateFlags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, external_handle_t handle) : - m_allocateFlags(static_cast(allocateFlags.value)), - m_reqs(reqs), - m_handleType(handleType), + m_allocateFlags(static_cast(allocateFlags.value)), + m_reqs(reqs), + m_handleType(handleType), m_handle(handle) - {} + {} static inline uint32_t end() {return 32u;} @@ -96,7 +96,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, external_handle_t handle ) : - IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) + IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } From e9937576030919e49803540524e59be7bbe7d078 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 14:17:32 +0700 Subject: [PATCH 080/149] Turn off NBL_COMPILE_WITH_CUDA by default --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5be1855959..fa74e167f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,8 +70,7 @@ else() message(STATUS "Vulkan SDK is not found") endif() -# TODO(kevinyu): Turn off this flag after I finish developing the PR. -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" ON) +option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) From dcf05522a1b1a04a594a9cf16bad59132a622456 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 15:04:41 +0700 Subject: [PATCH 081/149] Move CCUDAHandler constructor from protected to public --- include/nbl/video/CCUDAHandler.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 602637f202..61e9522a66 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -158,6 +158,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ); const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + // inline core::SRange getSTDHeaders() { @@ -261,7 +263,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); ~CCUDAHandler() = default; From f6bf98938bb975fe056ff3b4284900e5dc77b4b8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 23:22:38 +0700 Subject: [PATCH 082/149] Fix crash due to dangling win32metadata --- include/nbl/video/CCUDADevice.h | 2 -- src/nbl/video/CCUDADevice.cpp | 39 ++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git 
a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index f19a7fdae6..02f85fdac8 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -100,8 +100,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted private: CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; - CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; - static constexpr auto CudaMemoryLocationCount = 5; const system::logger_opt_ptr m_logger; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 83cbb2573a..27f8f6f906 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -40,7 +40,21 @@ CCUDADevice::CCUDADevice( for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - const auto prop = getMemAllocationProp(static_cast(locationType)); + + #ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; + #endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = static_cast(locationType), .id = m_handle }, + #ifdef _WIN32 + .win32HandleMetaData = &metadata, + #endif + }; ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); } } @@ -81,32 +95,27 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -CUmemAllocationProp CCUDADevice::getMemAllocationProp(CUmemLocationType locationType) const +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) { + CCUDAExportableMemory::SCachedCreationParams params = { inParams }; + + auto& cu = m_handler->getCUDAFunctionTable(); #ifdef _WIN32 - 
OBJECT_ATTRIBUTES metadata = {}; - metadata.Length = sizeof(OBJECT_ATTRIBUTES); + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; #endif - return { + const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = locationType, .id = m_handle }, + .location = { .type = params.location, .id = m_handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; -} -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) -{ - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - - auto& cu = m_handler->getCUDAFunctionTable(); - - const auto prop = getMemAllocationProp(params.location); - params.granularSize = roundToGranularity(params.location, params.size); CUmemGenericAllocationHandle mem; From 0d237c08b5e183074d6307e57e21f8e86546fcdd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:25:19 +0700 Subject: [PATCH 083/149] Implement vk flag for HOST_NUMA and HOST_NUMA_CURRENT --- src/nbl/video/CCUDAExportableMemory.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index ae64cf777f..66cbbdcf4f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -17,11 +17,10 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM switch (m_params.location) { - case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; - // TODO(Atil): Figure out how to handle these case CU_MEM_LOCATION_TYPE_HOST_NUMA: case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; default: break; } From f4ce3dc140f5d3abee707f853574bb12bf620131 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 09:28:24 +0200 Subject: 
[PATCH 084/149] Move CUDA interop behind extension target --- CMakeLists.txt | 15 +-- cmake/FindZLIB.cmake | 4 +- examples_tests | 2 +- .../{video => ext/CUDAInterop}/CCUDADevice.h | 13 ++- .../CUDAInterop}/CCUDAExportableMemory.h | 6 +- .../{video => ext/CUDAInterop}/CCUDAHandler.h | 70 ++++++++++++- .../CUDAInterop}/CCUDAImportedMemory.h | 16 +-- .../CUDAInterop}/CCUDAImportedSemaphore.h | 6 +- include/nbl/ext/CUDAInterop/CUDAInterop.h | 9 ++ include/nbl/ext/OptiX/IDenoiser.h | 4 +- include/nbl/system/DefaultFuncPtrLoader.h | 4 +- include/nbl/video/EApiType.h | 6 ++ include/nbl/video/declarations.h | 5 +- src/nbl/CMakeLists.txt | 23 +---- src/nbl/ext/CMakeLists.txt | 12 +++ .../CUDAInterop}/CCUDADevice.cpp | 5 +- .../CUDAInterop}/CCUDAExportableMemory.cpp | 7 +- .../CUDAInterop}/CCUDAHandler.cpp | 20 +++- .../CUDAInterop}/CCUDAImportedMemory.cpp | 7 +- .../CUDAInterop}/CCUDAImportedSemaphore.cpp | 7 +- src/nbl/ext/CUDAInterop/CMakeLists.txt | 46 +++++++++ src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 35 +++++++ src/nbl/ext/CUDAInterop/smoke/opt_in.cpp | 97 +++++++++++++++++++ .../ext/CUDAInterop/smoke/public_boundary.cpp | 15 +++ 24 files changed, 366 insertions(+), 68 deletions(-) rename include/nbl/{video => ext/CUDAInterop}/CCUDADevice.h (93%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAExportableMemory.h (93%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAHandler.h (78%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAImportedMemory.h (74%) rename include/nbl/{video => ext/CUDAInterop}/CCUDAImportedSemaphore.h (90%) create mode 100644 include/nbl/ext/CUDAInterop/CUDAInterop.h rename src/nbl/{video => ext/CUDAInterop}/CCUDADevice.cpp (98%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAExportableMemory.cpp (90%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAHandler.cpp (97%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAImportedMemory.cpp (84%) rename src/nbl/{video => ext/CUDAInterop}/CCUDAImportedSemaphore.cpp (71%) create mode 100644 
src/nbl/ext/CUDAInterop/CMakeLists.txt create mode 100644 src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt create mode 100644 src/nbl/ext/CUDAInterop/smoke/opt_in.cpp create mode 100644 src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index fa74e167f0..ff90d862ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,9 +70,13 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build the CUDA interop extension?" OFF) +set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) + if(NBL_CUDA_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}" CACHE PATH "CUDA Toolkit root" FORCE) + endif() find_package(CUDAToolkit REQUIRED) if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") @@ -183,13 +187,12 @@ option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON) option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON) option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" OFF) -if(NBL_COMPILE_WITH_CUDA) - find_package(OPTIX REQUIRED) - message(STATUS "CUDA enabled and OptiX found!") -else() - if(NBL_BUILD_OPTIX) +if(NBL_BUILD_OPTIX) + if(NOT NBL_COMPILE_WITH_CUDA) message(FATAL_ERROR "You cannot build Optix without enabled CUDA! NBL_COMPILE_WITH_CUDA must be ON!") endif() + find_package(OPTIX REQUIRED) + message(STATUS "CUDA enabled and OptiX found!") endif() option(NBL_BUILD_BULLET "Enable Bullet Physics building and integration?" 
OFF) diff --git a/cmake/FindZLIB.cmake b/cmake/FindZLIB.cmake index f855c396b9..42aa789bee 100644 --- a/cmake/FindZLIB.cmake +++ b/cmake/FindZLIB.cmake @@ -4,4 +4,6 @@ endif() set(ZLIB_FOUND TRUE) set(ZLIB_LIBRARY ZLIB::ZLIB) -set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") \ No newline at end of file +set(ZLIB_LIBRARIES ZLIB::ZLIB) +set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") +set(ZLIB_INCLUDE_DIRS "${ZLIB_INCLUDE_DIR}") diff --git a/examples_tests b/examples_tests index 93ca5efe58..cbb24a6404 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 93ca5efe588ca85c1eaf81a486b611df98403580 +Subproject commit cbb24a640442ace7bd01a7987f280ab0b6139e22 diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h similarity index 93% rename from include/nbl/video/CCUDADevice.h rename to include/nbl/ext/CUDAInterop/CCUDADevice.h index 02f85fdac8..d7886a4c53 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -5,14 +5,13 @@ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ -#include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/CCUDAExportableMemory.h" -#include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDAImportedSemaphore.h" - - #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" +#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 @@ -27,7 +26,7 @@ namespace nbl::video { class CCUDAHandler; -class NBL_API2 CCUDADevice : public core::IReferenceCounted +class CCUDADevice : public core::IReferenceCounted { public: #ifdef _WIN32 diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h similarity index 93% rename from 
include/nbl/video/CCUDAExportableMemory.h rename to include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 1c3d206906..10bf911717 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -7,6 +7,8 @@ #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 @@ -22,7 +24,7 @@ namespace nbl::video class CCUDADevice; -class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted +class CCUDAExportableMemory : public core::IReferenceCounted { public: @@ -62,4 +64,4 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted #endif // _NBL_COMPILE_WITH_CUDA_ -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h similarity index 78% rename from include/nbl/video/CCUDAHandler.h rename to include/nbl/ext/CUDAInterop/CCUDAHandler.h index 61e9522a66..8c86d9102c 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -9,7 +9,7 @@ #include "nbl/system/declarations.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -17,7 +17,7 @@ namespace nbl::video { -class NBL_API2 CCUDAHandler : public core::IReferenceCounted +class CCUDAHandler : public core::IReferenceCounted { public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); @@ -151,6 +151,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted nvrtcCreateProgram, nvrtcDestroyProgram, nvrtcGetLoweredName, + nvrtcGetCUBIN, + nvrtcGetCUBINSize, nvrtcGetPTX, nvrtcGetPTXSize, nvrtcGetProgramLog, @@ -216,6 +218,13 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted }; ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); + struct cubin_and_nvrtcResult_t + { + core::smart_refctd_ptr cubin; + nvrtcResult result; + }; + 
cubin_and_nvrtcResult_t getCUBIN(nvrtcProgram prog); + // inline ptx_and_nvrtcResult_t compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, @@ -260,6 +269,49 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ) + { + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + m_nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); + } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( + const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ) + { + return compileDirectlyToCUBIN(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); + } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( + system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ) + { + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + m_nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(&program,file,headerCount,headerContents,includeNames); 
+ return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); + } + core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: @@ -281,6 +333,20 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted return getPTX(program); } + inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) + { + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + result = compileProgram(program,nvrtcOptions); + if (log) + getProgramLog(program,*log); + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + return getCUBIN(program); + } + // function tables CUDA m_cuda; NVRTC m_nvrtc; diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h similarity index 74% rename from include/nbl/video/CCUDAImportedMemory.h rename to include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index 4e3bfcd085..5f885abd2d 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -1,20 +1,22 @@ -#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H -#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 #error "Need CUDA 9.0 SDK or higher." 
#endif -#endif // _NBL_COMPILE_WITH_CUDA - namespace nbl::video { -class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +class CCUDADevice; + +class CCUDAImportedMemory : public core::IReferenceCounted { public: @@ -39,4 +41,6 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted } -#endif \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h similarity index 90% rename from include/nbl/video/CCUDAImportedSemaphore.h rename to include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 2e5010fa2d..409ef1a676 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -6,6 +6,8 @@ #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "nbl/video/declarations.h" + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 @@ -19,7 +21,9 @@ namespace nbl::video { -class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted +class CCUDADevice; + +class CCUDAImportedSemaphore : public core::IReferenceCounted { public: diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h new file mode 100644 index 0000000000..b30d096049 --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInterop.h @@ -0,0 +1,9 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" + +#endif diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 7820aa1222..496383d92d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "../../../../src/nbl/video/CCUDAHandler.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #include #include @@ -122,4 +122,4 @@ class IDenoiser final : public core::IReferenceCounted } } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index 56142448c8..bbb9884e7a 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -35,9 +35,9 @@ class DefaultFuncPtrLoader final : FuncPtrLoader return lib!=nullptr; } - void* loadFuncPtr(const char* funcname) override final; + NBL_API2 void* loadFuncPtr(const char* funcname) override final; }; } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 7f99d40309..db29abe54d 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -4,6 +4,12 @@ #include "nbl/core/declarations.h" #include +#ifdef _WIN32 +#include +#else +#include +#endif + namespace nbl::video { diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 37f2f864bf..4393af1768 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -24,9 +24,6 @@ #include "nbl/video/CVulkanImage.h" #include "nbl/video/surface/CSurfaceVulkan.h" -// CUDA -#include "nbl/video/CCUDAHandler.h" - // utilities #include "nbl/video/utilities/CDumbPresentationOracle.h" 
#include "nbl/video/utilities/ICommandPoolCache.h" @@ -44,4 +41,4 @@ //#include "nbl/video/IGPUVirtualTexture.h" -#endif \ No newline at end of file +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 692efec8bd..de9bde3952 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -95,12 +95,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) - if (NBL_BUILD_OPTIX) - set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) - endif() +if (NBL_BUILD_OPTIX) + set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() # => TODO: clean! @@ -291,12 +287,6 @@ set(NBL_VIDEO_SOURCES video/CVulkanEvent.cpp video/CSurfaceVulkan.cpp -# CUDA - video/CCUDAHandler.cpp - video/CCUDADevice.cpp - video/CCUDAImportedSemaphore.cpp - video/CCUDAExportableMemory.cpp - video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES @@ -425,10 +415,6 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_COMPILE_WITH_CUDA) - target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) -endif() - set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) @@ -664,11 +650,6 @@ target_link_libraries(Nabla PRIVATE volk) # volk is part of public interface headers in Nabla target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) -# CUDA -if (NBL_COMPILE_WITH_CUDA) - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") -endif() - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS # this should be PRIVATE, but things from /src (or /source) are sometimes included in things in /include and so examples have to put source dirs into theirs Include Path # -> TODO diff --git 
a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index f3b55531c2..1f815413e8 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -38,6 +38,18 @@ if (NBL_BUILD_OPTIX) ) endif() +add_subdirectory(CUDAInterop) +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS + ${NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS} + PARENT_SCOPE + ) + set(NBL_EXT_CUDA_INTEROP_LIB + ${NBL_EXT_CUDA_INTEROP_LIB} + PARENT_SCOPE + ) +endif() + if (NBL_BUILD_IMGUI) add_subdirectory(ImGui) set(NBL_EXT_IMGUI_UI_INCLUDE_DIRS diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp similarity index 98% rename from src/nbl/video/CCUDADevice.cpp rename to src/nbl/ext/CUDAInterop/CCUDADevice.cpp index 27f8f6f906..aa06c6e7bf 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -1,13 +1,14 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _WIN32 #include #endif -#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp similarity index 90% rename from src/nbl/video/CCUDAExportableMemory.cpp rename to src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index 66cbbdcf4f..65afdca660 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -2,8 +2,9 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAExportableMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video @@ -51,4 +52,4 @@ CCUDAExportableMemory::~CCUDAExportableMemory() } } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp similarity index 97% rename from src/nbl/video/CCUDAHandler.cpp rename to src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 060afe6631..f9048d3bb6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAHandler.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -488,7 +488,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste {\ if (!cuda.p ## FUNC)\ return nullptr;\ - auto result = cuda.p ## FUNC ## (__VA_ARGS__);\ + auto result = cuda.p ## FUNC(__VA_ARGS__);\ if (result!=CUDA_SUCCESS)\ return nullptr;\ } @@ -570,6 +570,22 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } +CCUDAHandler::cubin_and_nvrtcResult_t CCUDAHandler::getCUBIN(nvrtcProgram prog) +{ + size_t _size = 0ull; + nvrtcResult sizeRes = m_nvrtc.pnvrtcGetCUBINSize(prog,&_size); + if (sizeRes!=NVRTC_SUCCESS) + return {nullptr,sizeRes}; + if (_size==0ull) + return {nullptr,NVRTC_ERROR_INVALID_INPUT}; + + asset::ICPUBuffer::SCreationParams cubinParams = {}; + cubinParams.size = _size; + auto cubin = 
asset::ICPUBuffer::create(std::move(cubinParams)); + auto cubinPtr = static_cast(cubin->getPointer()); + return {std::move(cubin),m_nvrtc.pnvrtcGetCUBIN(prog,cubinPtr)}; +} + core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) { if (!vulkanConnection) diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp similarity index 84% rename from src/nbl/video/CCUDAImportedMemory.cpp rename to src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index 7e21b05ef1..a785bad9b9 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -2,8 +2,9 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -29,4 +30,4 @@ CCUDAImportedMemory::~CCUDAImportedMemory() } -#endif \ No newline at end of file +#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp similarity index 71% rename from src/nbl/video/CCUDAImportedSemaphore.cpp rename to src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index 0dc750a4a9..1ca4a34190 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -2,8 +2,9 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAImportedSemaphore.h" -#include "nbl/video/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" +#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include "nbl/ext/CUDAInterop/CCUDAHandler.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video @@ -15,4 +16,4 @@ CCUDAImportedSemaphore::~CCUDAImportedSemaphore() } } -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt new file mode 100644 index 0000000000..d3f8e85169 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -0,0 +1,46 @@ +include(${NBL_ROOT_PATH}/cmake/common.cmake) + +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop") + + set(NBL_EXT_CUDA_INTEROP_H + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedMemory.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedSemaphore.h + ) + + set(NBL_EXT_CUDA_INTEROP_SRC + CCUDADevice.cpp + CCUDAExportableMemory.cpp + CCUDAHandler.cpp + CCUDAImportedMemory.cpp + CCUDAImportedSemaphore.cpp + ) + + nbl_create_ext_library_project( + CUDA_INTEROP + "${NBL_EXT_CUDA_INTEROP_H}" + "${NBL_EXT_CUDA_INTEROP_SRC}" + "" + "" + "_NBL_COMPILE_WITH_CUDA_" + ) + + set(NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) + if(CUDAToolkit_ROOT) + list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS + "${CUDAToolkit_ROOT}/include" + "${CUDAToolkit_ROOT}/include/cccl" + ) + endif() + list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) + list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) + + target_include_directories(${LIB_NAME} BEFORE PUBLIC 
${NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS}) + add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) +endif() + +add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt new file mode 100644 index 0000000000..7805153e32 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -0,0 +1,35 @@ +enable_testing() + +set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS) +if(CMAKE_CONFIGURATION_TYPES) + set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS --config $) +endif() + +function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) + add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) + nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) + + set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$") + if(CUDAToolkit_BIN_DIR) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}") + endif() + + add_test( + NAME ${TARGET_NAME}.build + COMMAND ${CMAKE_COMMAND} --build "${CMAKE_CURRENT_BINARY_DIR}" --target ${TARGET_NAME} ${_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS} + ) + add_test(NAME ${TARGET_NAME}.run COMMAND $) + set_tests_properties(${TARGET_NAME}.run PROPERTIES + DEPENDS ${TARGET_NAME}.build + ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}" + ) +endfunction() + +nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) +target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) + +if(TARGET Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropOptInSmoke opt_in.cpp) + target_link_libraries(NblExtCUDAInteropOptInSmoke PRIVATE Nabla::ext::CUDAInterop) +endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp new file mode 100644 index 0000000000..d6afab79d2 --- /dev/null +++ 
b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp @@ -0,0 +1,97 @@ +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#include +#include +#include +#include + +#ifndef _NBL_COMPILE_WITH_CUDA_ +#error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop." +#endif + +namespace +{ +using namespace nbl; +using namespace nbl::video; + +[[maybe_unused]] bool compileVulkanCudaInteropRecipe( + CCUDADevice& cudaDevice, + ILogicalDevice* vulkanDevice, + core::smart_refctd_ptr vulkanMemory, + core::smart_refctd_ptr vulkanSemaphore) +{ + auto cudaMemory = cudaDevice.createExportableMemory({ + .size = 4096, + .alignment = 4096, + .location = CU_MEM_LOCATION_TYPE_DEVICE, + }); + if (!cudaMemory) + return false; + + auto exportedToVulkan = cudaMemory->exportAsMemory(vulkanDevice); + auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); + auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); + + CUdeviceptr mappedVulkanMemory = 0; + if (importedFromVulkan) + importedFromVulkan->getMappedBuffer(&mappedVulkanMemory); + + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
importedSemaphore->getInternalObject():nullptr; + return exportedToVulkan.get() && mappedVulkanMemory && cudaSemaphore; +} + +bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) +{ + auto& cuda = handler.getCUDAFunctionTable(); + + CUcontext context = nullptr; + if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) + return false; + + CUcontext poppedContext = nullptr; + auto releaseContext = [&]() + { + if (context) + { + cuda.pcuCtxPopCurrent_v2(&poppedContext); + cuda.pcuDevicePrimaryCtxRelease_v2(device); + } + }; + + if (cuda.pcuCtxPushCurrent_v2(context)!=CUDA_SUCCESS) + { + releaseContext(); + return false; + } + + constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; + std::array output = {}; + + CUdeviceptr deviceMemory = 0; + bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; + ok = ok && cuda.pcuMemcpyHtoD_v2(deviceMemory, input.data(), sizeof(input))==CUDA_SUCCESS; + ok = ok && cuda.pcuMemcpyDtoH_v2(output.data(), deviceMemory, sizeof(output))==CUDA_SUCCESS; + if (deviceMemory) + ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; + + releaseContext(); + return ok && std::ranges::equal(input, output); +} +} + +int main() +{ + static_assert(std::is_same_v().getInternalObject()), CUdevice>); + CUdeviceptr devicePtr = 0; + static_cast(devicePtr); + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return 0; + + const auto& devices = handler->getAvailableDevices(); + if (devices.empty()) + return 0; + + return cudaDriverRoundtrip(*handler, devices.front().handle) ? 
0:1; +} diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp new file mode 100644 index 0000000000..809d1e7b93 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -0,0 +1,15 @@ +#include "nabla.h" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Default Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Default Nabla consumers must not include CUDA SDK headers." +#endif + +int main() +{ + return 0; +} From 78845ae3f2bfb360316aab2f905d0b415165d52c Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 11:43:32 +0200 Subject: [PATCH 085/149] Address CUDA interop review cleanup --- CMakeLists.txt | 2 +- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 66 ------------------- include/nbl/ext/CUDAInterop/CUDAInterop.h | 4 ++ include/nbl/system/DefaultFuncPtrLoader.h | 8 +-- include/nbl/video/EApiType.h | 31 +-------- src/nbl/CMakeLists.txt | 1 + src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 16 ----- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 36 ++++++---- src/nbl/ext/CUDAInterop/smoke/opt_in.cpp | 50 ++++++++++---- .../ext/CUDAInterop/smoke/public_boundary.cpp | 24 ++++++- src/nbl/video/EApiType.cpp | 37 +++++++++++ 12 files changed, 130 insertions(+), 147 deletions(-) create mode 100644 src/nbl/video/EApiType.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ff90d862ce..c5e1bfac20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,7 +75,7 @@ set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NB if(NBL_COMPILE_WITH_CUDA) if(NBL_CUDA_TOOLKIT_ROOT) - set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}" CACHE PATH "CUDA Toolkit root" FORCE) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() find_package(CUDAToolkit REQUIRED) if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") diff --git a/examples_tests b/examples_tests index 
cbb24a6404..5c604d274b 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit cbb24a640442ace7bd01a7987f280ab0b6139e22 +Subproject commit 5c604d274b8aac99d8855f5b7aaf615910c8a5f6 diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 8c86d9102c..5128aad575 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -151,8 +151,6 @@ class CCUDAHandler : public core::IReferenceCounted nvrtcCreateProgram, nvrtcDestroyProgram, nvrtcGetLoweredName, - nvrtcGetCUBIN, - nvrtcGetCUBINSize, nvrtcGetPTX, nvrtcGetPTXSize, nvrtcGetProgramLog, @@ -218,13 +216,6 @@ class CCUDAHandler : public core::IReferenceCounted }; ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); - struct cubin_and_nvrtcResult_t - { - core::smart_refctd_ptr cubin; - nvrtcResult result; - }; - cubin_and_nvrtcResult_t getCUBIN(nvrtcProgram prog); - // inline ptx_and_nvrtcResult_t compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, @@ -269,49 +260,6 @@ class CCUDAHandler : public core::IReferenceCounted return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); - }); - - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); - } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( - const char* source, const char* 
filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToCUBIN(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); - }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToCUBIN_impl(result,program,nvrtcOptions,log); - } - core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: @@ -333,20 +281,6 @@ class CCUDAHandler : public core::IReferenceCounted return getPTX(program); } - inline cubin_and_nvrtcResult_t compileDirectlyToCUBIN_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getCUBIN(program); - } - // function tables CUDA m_cuda; NVRTC m_nvrtc; diff --git a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h index b30d096049..06d9016dc8 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInterop.h +++ b/include/nbl/ext/CUDAInterop/CUDAInterop.h @@ -4,6 +4,10 @@ #ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ +#include "nbl/ext/CUDAInterop/CCUDADevice.h" 
+#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" #include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" +#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" #endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index bbb9884e7a..10fab3a454 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -11,18 +11,18 @@ namespace nbl::system { -class DefaultFuncPtrLoader final : FuncPtrLoader +class NBL_API2 DefaultFuncPtrLoader final : FuncPtrLoader { void* lib; public: inline DefaultFuncPtrLoader() : lib(nullptr) {} - NBL_API2 DefaultFuncPtrLoader(const char* name); + DefaultFuncPtrLoader(const char* name); inline DefaultFuncPtrLoader(DefaultFuncPtrLoader&& other) : DefaultFuncPtrLoader() { operator=(std::move(other)); } - NBL_API2 ~DefaultFuncPtrLoader(); + ~DefaultFuncPtrLoader(); inline DefaultFuncPtrLoader& operator=(DefaultFuncPtrLoader&& other) { @@ -35,7 +35,7 @@ class DefaultFuncPtrLoader final : FuncPtrLoader return lib!=nullptr; } - NBL_API2 void* loadFuncPtr(const char* funcname) override final; + void* loadFuncPtr(const char* funcname) override final; }; } diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index db29abe54d..44a31ecf90 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -4,12 +4,6 @@ #include "nbl/core/declarations.h" #include -#ifdef _WIN32 -#include -#else -#include -#endif - namespace nbl::video { @@ -34,29 +28,8 @@ constexpr external_handle_t ExternalHandleNull = nullptr; constexpr external_handle_t ExternalHandleNull = -1; #endif -inline bool CloseExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - return CloseHandle(handle); -#else - return (close(handle) == 0); -#endif -} - -inline external_handle_t DuplicateExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - HANDLE re = ExternalHandleNull; - - const HANDLE cur = 
GetCurrentProcess(); - if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; - - return re; -#else - return dup(handle); -#endif -} +NBL_API2 bool CloseExternalHandle(external_handle_t handle); +NBL_API2 external_handle_t DuplicateExternalHandle(external_handle_t handle); } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index de9bde3952..acbf4d4dda 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -248,6 +248,7 @@ set(NBL_VIDEO_SOURCES video/IGPUAccelerationStructure.cpp video/IGPUCommandBuffer.cpp video/IQueue.cpp + video/EApiType.cpp video/IGPUDescriptorSet.cpp video/IDeviceMemoryAllocation.cpp video/IDeviceMemoryBacked.cpp diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index f9048d3bb6..748a88d1a1 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -570,22 +570,6 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } -CCUDAHandler::cubin_and_nvrtcResult_t CCUDAHandler::getCUBIN(nvrtcProgram prog) -{ - size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetCUBINSize(prog,&_size); - if (sizeRes!=NVRTC_SUCCESS) - return {nullptr,sizeRes}; - if (_size==0ull) - return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - - asset::ICPUBuffer::SCreationParams cubinParams = {}; - cubinParams.size = _size; - auto cubin = asset::ICPUBuffer::create(std::move(cubinParams)); - auto cubinPtr = static_cast(cubin->getPointer()); - return {std::move(cubin),m_nvrtc.pnvrtcGetCUBIN(prog,cubinPtr)}; -} - core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) { if (!vulkanConnection) diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 7805153e32..678cd29d84 100644 --- 
a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,27 +1,35 @@ -enable_testing() - -set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS) -if(CMAKE_CONFIGURATION_TYPES) - set(_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS --config $) +if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + cmake_minimum_required(VERSION 3.30) + project(NblExtCUDAInteropSmoke CXX) + find_package(Nabla REQUIRED CONFIG) endif() +enable_testing() + function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) - add_executable(${TARGET_NAME} EXCLUDE_FROM_ALL ${SOURCE_FILE}) + add_executable(${TARGET_NAME} ${SOURCE_FILE}) target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) - nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) + if(MSVC) + target_compile_options(${TARGET_NAME} PRIVATE + /Gm- + /bigobj + /Zc:wchar_t + /Zc:preprocessor + /Zc:inline + /Zc:forScope + ) + endif() + if(COMMAND nbl_adjust_flags) + nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) + endif() set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$") if(CUDAToolkit_BIN_DIR) list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}") endif() - add_test( - NAME ${TARGET_NAME}.build - COMMAND ${CMAKE_COMMAND} --build "${CMAKE_CURRENT_BINARY_DIR}" --target ${TARGET_NAME} ${_NBL_CUDA_INTEROP_SMOKE_CONFIG_ARGS} - ) - add_test(NAME ${TARGET_NAME}.run COMMAND $) - set_tests_properties(${TARGET_NAME}.run PROPERTIES - DEPENDS ${TARGET_NAME}.build + add_test(NAME ${TARGET_NAME} COMMAND $) + set_tests_properties(${TARGET_NAME} PROPERTIES ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}" ) endfunction() diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp index d6afab79d2..adcb48e6de 100644 --- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp +++ 
b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp @@ -1,9 +1,11 @@ #include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/system/IApplicationFramework.h" #include #include #include #include +#include #ifndef _NBL_COMPILE_WITH_CUDA_ #error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop." @@ -69,8 +71,10 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) CUdeviceptr deviceMemory = 0; bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; - ok = ok && cuda.pcuMemcpyHtoD_v2(deviceMemory, input.data(), sizeof(input))==CUDA_SUCCESS; - ok = ok && cuda.pcuMemcpyDtoH_v2(output.data(), deviceMemory, sizeof(output))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyHtoD_v2(deviceMemory,input.data(),sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyDtoH_v2(output.data(),deviceMemory,sizeof(output))==CUDA_SUCCESS; if (deviceMemory) ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; @@ -79,19 +83,37 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) } } -int main() +class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework { - static_assert(std::is_same_v().getInternalObject()), CUdevice>); - CUdeviceptr devicePtr = 0; - static_cast(devicePtr); + using base_t = nbl::system::IApplicationFramework; - auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); - if (!handler) - return 0; +public: + using base_t::base_t; - const auto& devices = handler->getAvailableDevices(); - if (devices.empty()) - return 0; + bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + { + static_cast(system); - return cudaDriverRoundtrip(*handler, devices.front().handle) ? 
0:1; -} + if (!isAPILoaded()) + return false; + + static_assert(std::is_same_v().getInternalObject()), CUdevice>); + CUdeviceptr devicePtr = 0; + static_cast(devicePtr); + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return true; + + const auto& devices = handler->getAvailableDevices(); + if (devices.empty()) + return true; + + return cudaDriverRoundtrip(*handler, devices.front().handle); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +NBL_MAIN_FUNC(CUDAInteropOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index 809d1e7b93..c39ba076d4 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -1,4 +1,5 @@ #include "nabla.h" +#include "nbl/system/IApplicationFramework.h" #include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -9,7 +10,26 @@ #error "Default Nabla consumers must not include CUDA SDK headers." 
#endif -int main() +namespace { - return 0; + +class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + { + static_cast(system); + return isAPILoaded(); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + } + +NBL_MAIN_FUNC(CUDAInteropPublicBoundarySmoke) diff --git a/src/nbl/video/EApiType.cpp b/src/nbl/video/EApiType.cpp new file mode 100644 index 0000000000..d7eadd8b08 --- /dev/null +++ b/src/nbl/video/EApiType.cpp @@ -0,0 +1,37 @@ +#include "nbl/video/EApiType.h" + +#ifdef _WIN32 +#define WIN32_LEAN_AND_MEAN +#include +#else +#include +#endif + +namespace nbl::video +{ + +bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle)==0; +#endif +} + +external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} + +} From ab9a7e560fadaf960a1a9f4879a02f6e66833d2a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 12:04:30 +0200 Subject: [PATCH 086/149] Simplify CUDA interop smoke CMake --- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 678cd29d84..89dd821add 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,6 +1,9 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) cmake_minimum_required(VERSION 3.30) 
project(NblExtCUDAInteropSmoke CXX) +endif() + +if(NOT TARGET Nabla::Nabla) find_package(Nabla REQUIRED CONFIG) endif() @@ -19,19 +22,8 @@ function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) /Zc:forScope ) endif() - if(COMMAND nbl_adjust_flags) - nbl_adjust_flags(TARGET ${TARGET_NAME} MAP_RELEASE Release MAP_RELWITHDEBINFO RelWithDebInfo MAP_DEBUG Debug) - endif() - - set(_NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:$") - if(CUDAToolkit_BIN_DIR) - list(APPEND _NBL_CUDA_INTEROP_SMOKE_PATH_MODS "PATH=path_list_prepend:${CUDAToolkit_BIN_DIR}") - endif() add_test(NAME ${TARGET_NAME} COMMAND $) - set_tests_properties(${TARGET_NAME} PROPERTIES - ENVIRONMENT_MODIFICATION "${_NBL_CUDA_INTEROP_SMOKE_PATH_MODS}" - ) endfunction() nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) From bf8eeb3509935dd7f0b5970e87a44ea88bf5a4fb Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 12:27:26 +0200 Subject: [PATCH 087/149] Clean CUDA interop smoke usage requirements --- src/nbl/CMakeLists.txt | 7 +++++-- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 16 ++-------------- src/nbl/ext/CUDAInterop/smoke/opt_in.cpp | 6 +----- .../ext/CUDAInterop/smoke/public_boundary.cpp | 7 +++---- 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index acbf4d4dda..bb96bdfc80 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -763,8 +763,11 @@ if(TARGET ngfx) ) endif() -# on MSVC it won't compile without this option! -target_compile_options(Nabla PUBLIC $<$:/bigobj>) +# on MSVC it won't compile without these options! 
+target_compile_options(Nabla PUBLIC + $<$:/bigobj> + $<$:/Zc:preprocessor> +) if(NBL_PCH) target_precompile_headers(Nabla diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 89dd821add..23dd6d5422 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,7 +1,5 @@ -if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) - cmake_minimum_required(VERSION 3.30) - project(NblExtCUDAInteropSmoke CXX) -endif() +cmake_minimum_required(VERSION 3.30) +project(NblExtCUDAInteropSmoke CXX) if(NOT TARGET Nabla::Nabla) find_package(Nabla REQUIRED CONFIG) @@ -12,16 +10,6 @@ enable_testing() function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) add_executable(${TARGET_NAME} ${SOURCE_FILE}) target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) - if(MSVC) - target_compile_options(${TARGET_NAME} PRIVATE - /Gm- - /bigobj - /Zc:wchar_t - /Zc:preprocessor - /Zc:inline - /Zc:forScope - ) - endif() add_test(NAME ${TARGET_NAME} COMMAND $) endfunction() diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp index adcb48e6de..bc8c8952bd 100644 --- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp @@ -90,16 +90,12 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework public: using base_t::base_t; - bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { - static_cast(system); - if (!isAPILoaded()) return false; static_assert(std::is_same_v().getInternalObject()), CUdevice>); - CUdeviceptr devicePtr = 0; - static_cast(devicePtr); auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); if (!handler) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index c39ba076d4..4f6cbebfb1 100644 --- 
a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -3,11 +3,11 @@ #include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Default Nabla consumers must not get the CUDA opt-in define." +#error "Nabla consumers must not get the CUDA opt-in define." #endif #ifdef CUDA_VERSION -#error "Default Nabla consumers must not include CUDA SDK headers." +#error "Nabla consumers must not include CUDA SDK headers." #endif namespace @@ -20,9 +20,8 @@ class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFra public: using base_t::base_t; - bool onAppInitialized(nbl::core::smart_refctd_ptr&& system) override + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { - static_cast(system); return isAPILoaded(); } From f701ac63e83bea4bf743af80a6fe29af81d002c0 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 13:28:07 +0200 Subject: [PATCH 088/149] Export CUDA interop package target --- cmake/NablaConfig.cmake.in | 33 ++++++++++++++++++++ cmake/common.cmake | 19 +++++++++-- src/nbl/CMakeLists.txt | 19 ++++++++++- src/nbl/ext/CUDAInterop/CMakeLists.txt | 5 ++- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 2 +- 5 files changed, 72 insertions(+), 6 deletions(-) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index b22b3ad0d7..e88a25b0dd 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -6,6 +6,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -25,6 +26,10 @@ if(_NBL_NABLA_COMPONENTS) elseif(_NBL_NABLA_COMPONENT STREQUAL "Core") set(_NBL_NABLA_LOAD_CORE ON) set(Nabla_Core_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInterop") + 
set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(Nabla_CUDAInterop_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -80,6 +85,34 @@ if(_NBL_NABLA_LOAD_NSC) endif() endif() +if(_NBL_NABLA_LOAD_CUDA_INTEROP) + include(CMakeFindDependencyMacro) + + if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") + set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") + endif() + + find_dependency(CUDAToolkit REQUIRED) + if(CUDAToolkit_VERSION VERSION_LESS "13.0") + set(Nabla_CUDAInterop_FOUND FALSE) + if(Nabla_FIND_REQUIRED_CUDAInterop) + message(FATAL_ERROR "Nabla: CUDAInterop requires CUDA Toolkit 13.0 or newer. Set Nabla_CUDA_TOOLKIT_ROOT or CUDAToolkit_ROOT if multiple CUDA Toolkit installs are present.") + endif() + else() + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + set(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) + foreach(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR IN LISTS CUDAToolkit_INCLUDE_DIRS) + if(EXISTS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") + list(APPEND _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") + endif() + endforeach() + list(REMOVE_DUPLICATES _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS) + target_include_directories(Nabla::ext::CUDAInterop INTERFACE ${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS}) + endif() + endif() +endif() + check_required_components(Nabla) # diff --git a/cmake/common.cmake b/cmake/common.cmake index c50e1f6fb2..ae2264fda4 100755 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -284,9 +284,22 @@ function(nbl_install_dir _DIR) endfunction() function(nbl_install_lib_spec _TARGETS _RELATIVE_DESTINATION) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION 
debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) + cmake_parse_arguments(_NBL_INSTALL_LIB "" "EXPORT" "" ${ARGN}) + if(_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Unexpected arguments for nbl_install_lib_spec: ${_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS}") + endif() + + if(_NBL_INSTALL_LIB_EXPORT) + install(TARGETS ${_TARGETS} + EXPORT ${_NBL_INSTALL_LIB_EXPORT} + ARCHIVE DESTINATION ${_NBL_CPACK_PACKAGE_RELATIVE_ENTRY_}/lib/${_RELATIVE_DESTINATION} + COMPONENT Libraries + ) + else() + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) + endif() endfunction() function(nbl_install_lib _TARGETS) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index bb96bdfc80..6c3ab2606d 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -778,11 +778,28 @@ if(NBL_PCH) ) endif() -# extensions start_tracking_variables_for_propagation_to_parent() add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() +if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) + + set(_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + list(APPEND _NBL_EXT_CUDA_INTEROP_INSTALL_ARGS EXPORT NablaCUDAInteropExportTargets) + endif() + nbl_install_lib_spec(${NBL_EXT_CUDA_INTEROP_LIB} "nbl/ext/CUDA_INTEROP" ${_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS}) + + if(NBL_ENABLE_CONFIG_INSTALL AND NOT 
NBL_STATIC_BUILD) + install(EXPORT NablaCUDAInteropExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() +endif() + if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) set_target_properties(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) nbl_install_lib_spec(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} "nbl/ext/FULL_SCREEN_TRIANGLE") diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index d3f8e85169..93b6bef8c1 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -39,7 +39,10 @@ if (NBL_COMPILE_WITH_CUDA) list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - target_include_directories(${LIB_NAME} BEFORE PUBLIC ${NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS}) + foreach(_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR IN LISTS NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) + target_include_directories(${LIB_NAME} BEFORE PUBLIC $) + endforeach() + set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 23dd6d5422..cd9ba7b70e 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.30) project(NblExtCUDAInteropSmoke CXX) if(NOT TARGET Nabla::Nabla) - find_package(Nabla REQUIRED CONFIG) + find_package(Nabla REQUIRED CONFIG COMPONENTS Core CUDAInterop) endif() enable_testing() From a520d57a443c421d41e5f72c14cec70d29d6f175 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 13:42:37 +0200 Subject: [PATCH 089/149] Use CUDAToolkit package targets --- cmake/NablaConfig.cmake.in | 9 +-------- src/nbl/ext/CUDAInterop/CMakeLists.txt | 14 +------------- 2 files changed, 
2 insertions(+), 21 deletions(-) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index e88a25b0dd..ca32518244 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -101,14 +101,7 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) else() _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) - set(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) - foreach(_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR IN LISTS CUDAToolkit_INCLUDE_DIRS) - if(EXISTS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") - list(APPEND _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS "${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIR}/cccl") - endif() - endforeach() - list(REMOVE_DUPLICATES _NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS) - target_include_directories(Nabla::ext::CUDAInterop INTERFACE ${_NBL_NABLA_CUDA_INTEROP_INCLUDE_DIRS}) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) endif() endif() endif() diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 93b6bef8c1..7a69e62ad4 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -29,19 +29,7 @@ if (NBL_COMPILE_WITH_CUDA) "_NBL_COMPILE_WITH_CUDA_" ) - set(NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - if(CUDAToolkit_ROOT) - list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS - "${CUDAToolkit_ROOT}/include" - "${CUDAToolkit_ROOT}/include/cccl" - ) - endif() - list(APPEND NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS}) - list(REMOVE_DUPLICATES NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - - foreach(_NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIR IN LISTS NBL_EXT_CUDA_INTEROP_CUDA_INCLUDE_DIRS) - target_include_directories(${LIB_NAME} BEFORE PUBLIC $) - endforeach() + target_link_libraries(${LIB_NAME} PUBLIC $) set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") 
add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) endif() From 4bddc571ade70a289036d87772a85b35870c5307 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 14:29:27 +0200 Subject: [PATCH 090/149] Require CUDA version via CMake --- CMakeLists.txt | 8 ++------ cmake/NablaConfig.cmake.in | 15 ++++----------- .../ext/CUDAInterop/smoke/public_boundary.cpp | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5e1bfac20..14845789fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,12 +77,8 @@ if(NBL_COMPILE_WITH_CUDA) if(NBL_CUDA_TOOLKIT_ROOT) set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() - find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") - message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") - else() - message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!") - endif() + find_package(CUDAToolkit 13.0 REQUIRED) + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") endif() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index ca32518244..8b9f62e548 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -92,17 +92,10 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") endif() - find_dependency(CUDAToolkit REQUIRED) - if(CUDAToolkit_VERSION VERSION_LESS "13.0") - set(Nabla_CUDAInterop_FOUND FALSE) - if(Nabla_FIND_REQUIRED_CUDAInterop) - message(FATAL_ERROR "Nabla: CUDAInterop requires CUDA Toolkit 13.0 or newer. 
Set Nabla_CUDA_TOOLKIT_ROOT or CUDAToolkit_ROOT if multiple CUDA Toolkit installs are present.") - endif() - else() - _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) - if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) - target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) - endif() + find_dependency(CUDAToolkit 13.0 REQUIRED) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) endif() endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index 4f6cbebfb1..eb7061f0ee 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -1,5 +1,23 @@ #include "nabla.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + #include "nbl/system/IApplicationFramework.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." 
+#endif + #include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ From 6f68e6644eb222cc5c6a875a8a85e97650261537 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:17:15 +0200 Subject: [PATCH 091/149] Split CUDA interop native surface --- cmake/NablaConfig.cmake.in | 17 +- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDADevice.h | 43 +-- .../ext/CUDAInterop/CCUDAExportableMemory.h | 88 +++--- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 274 ++---------------- .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h | 41 +-- .../ext/CUDAInterop/CCUDAImportedSemaphore.h | 47 ++- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 211 ++++++++++++++ src/nbl/CMakeLists.txt | 14 + src/nbl/ext/CMakeLists.txt | 4 + src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 85 ++++-- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 34 ++- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 161 ++++++++-- .../ext/CUDAInterop/CCUDAImportedMemory.cpp | 32 +- .../CUDAInterop/CCUDAImportedSemaphore.cpp | 24 +- src/nbl/ext/CUDAInterop/CMakeLists.txt | 17 +- .../CUDAInterop/CUDAInteropNativeState.hpp | 106 +++++++ src/nbl/ext/CUDAInterop/README.md | 23 ++ src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 17 +- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 42 +++ .../smoke/{opt_in.cpp => native_opt_in.cpp} | 25 +- 21 files changed, 817 insertions(+), 490 deletions(-) create mode 100644 include/nbl/ext/CUDAInterop/CUDAInteropNative.h create mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp create mode 100644 src/nbl/ext/CUDAInterop/README.md create mode 100644 src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp rename src/nbl/ext/CUDAInterop/smoke/{opt_in.cpp => native_opt_in.cpp} (72%) diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index 8b9f62e548..afff3dcccc 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -7,6 +7,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso 
set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -30,6 +31,12 @@ if(_NBL_NABLA_COMPONENTS) set(_NBL_NABLA_LOAD_CORE ON) set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) set(Nabla_CUDAInterop_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInteropNative") + set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE ON) + set(Nabla_CUDAInterop_FOUND TRUE) + set(Nabla_CUDAInteropNative_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -86,6 +93,10 @@ if(_NBL_NABLA_LOAD_NSC) endif() if(_NBL_NABLA_LOAD_CUDA_INTEROP) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) +endif() + +if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE) include(CMakeFindDependencyMacro) if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") @@ -93,9 +104,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) endif() find_dependency(CUDAToolkit 13.0 REQUIRED) - _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) - if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) - target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + _nbl_try_include_component("CUDAInteropNative" "NablaCUDAInteropNativeExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND AND TARGET Nabla::ext::CUDAInteropNative) + target_link_libraries(Nabla::ext::CUDAInteropNative INTERFACE CUDA::toolkit) endif() endif() diff --git a/examples_tests b/examples_tests index 5c604d274b..7a2a4f604f 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5c604d274b8aac99d8855f5b7aaf615910c8a5f6 +Subproject commit 
7a2a4f604fd941984d6624e3059f7380cc6592a2 diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index d7886a4c53..25c40e7ed6 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -4,37 +4,32 @@ #ifndef _NBL_VIDEO_C_CUDA_DEVICE_H_ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ - -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" #include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" #include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" #include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include +#include namespace nbl::video { class CCUDAHandler; +namespace cuda_native +{ +struct SAccess; +} + class CCUDADevice : public core::IReferenceCounted { - public: + public: + struct SNativeState; #ifdef _WIN32 static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; - static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif enum E_VIRTUAL_ARCHITECTURE @@ -73,22 +68,20 @@ class CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} - CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, 
const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - ~CCUDADevice(); + ~CCUDADevice() override; inline core::SRange geDefaultCompileOptions() const { return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } - CUdevice getInternalObject() const { return m_handle; } - const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - size_t roundToGranularity(CUmemLocationType location, size_t size) const; + size_t roundToGranularity(ECUDAMemoryLocation location, size_t size) const; core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); @@ -97,24 +90,20 @@ class CCUDADevice : public core::IReferenceCounted core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); private: - CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; + friend struct cuda_native::SAccess; static constexpr auto CudaMemoryLocationCount = 5; - const system::logger_opt_ptr m_logger; + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; - CUdevice m_handle; - CUcontext m_context; - std::array m_allocationGranularity; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 10bf911717..5973c31fac 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -4,64 
+4,60 @@ #ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ - -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { - class CCUDADevice; -class CCUDAExportableMemory : public core::IReferenceCounted +namespace cuda_native { - public: - - struct SCreationParams - { - size_t size; - uint32_t alignment; - CUmemLocationType location; - }; - - struct SCachedCreationParams : SCreationParams - { - size_t granularSize; - CUdeviceptr ptr; - external_handle_t externalHandle; - }; - - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) - : m_device(std::move(device)) - , m_params(std::move(params)) - {} - ~CCUDAExportableMemory() override; - - CUdeviceptr getDeviceptr() const { return m_params.ptr; } - - const SCreationParams& getCreationParams() const { return m_params; } - - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; +struct SAccess; +} - private: +enum class ECUDAMemoryLocation : uint32_t +{ + DEVICE = 1, + HOST = 2, + HOST_NUMA = 3, + HOST_NUMA_CURRENT = 4 +}; - core::smart_refctd_ptr m_device; - SCachedCreationParams m_params; +class CCUDAExportableMemory : public core::IReferenceCounted +{ + public: + struct SNativeState; + struct SCreationParams + { + size_t size; + uint32_t alignment; + ECUDAMemoryLocation location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + external_handle_t externalHandle; + }; + + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + ~CCUDAExportableMemory() override; + + const SCreationParams& getCreationParams() const { return m_params; } + + 
core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + private: + friend struct cuda_native::SAccess; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 5128aad575..063598a518 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -9,158 +9,30 @@ #include "nbl/system/declarations.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" +#include +#include +#include +#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +class CCUDADevice; +class CVulkanConnection; +class IPhysicalDevice; +namespace cuda_native +{ +struct SAccess; +} class CCUDAHandler : public core::IReferenceCounted { - public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); - - inline bool defaultHandleResult(CUresult result) const - { - core::smart_refctd_ptr logger = m_logger.get(); - return defaultHandleResult(result,logger.get()); - } - - // - bool defaultHandleResult(nvrtcResult result); - - // - template - static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } - - // + public: + struct SNativeState; static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - // - using LibLoader = system::DefaultFuncPtrLoader; - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - 
,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - ,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - ,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - ,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync - ,cuLogsRegisterCallback - ); - const CUDA& getCUDAFunctionTable() const {return m_cuda;} - - 
NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize - ); - const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - - // inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); @@ -169,29 +41,9 @@ class CCUDAHandler : public core::IReferenceCounted inline const auto& getSTDHeaderContents() { return m_headerContents; } inline const auto& getSTDHeaderNames() { return m_headerNames; } - // - nvrtcResult createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - inline nvrtcResult createProgram(nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - return createProgram(prog,std::string(source),name,headerCount,headerContents,includeNames); - } - inline nvrtcResult createProgram(nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::IFile::success_t bytesRead; - file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); - - return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); - } - struct 
SCUDADeviceInfo { - CUdevice handle = {}; - CUuuid uuid = {}; - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + std::array uuid = {}; }; inline core::vector const& getAvailableDevices() const @@ -199,93 +51,15 @@ class CCUDAHandler : public core::IReferenceCounted return m_availableDevices; } - // - inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) - { - return m_nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); - } - - // - nvrtcResult getProgramLog(nvrtcProgram prog, std::string& log); - - // - struct ptx_and_nvrtcResult_t - { - core::smart_refctd_ptr ptx; - nvrtcResult result; - }; - ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: + ~CCUDAHandler() override; - ~CCUDAHandler() = default; - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getPTX(program); - } - - // function tables - CUDA m_cuda; - NVRTC m_nvrtc; + private: + friend struct cuda_native::SAccess; - // + std::unique_ptr m_native; core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; @@ -295,16 +69,6 @@ class CCUDAHandler : public core::IReferenceCounted int m_version; }; -#define ASSERT_CUDA_SUCCESS(expr, handler) \ - do { \ - const auto cudaResult = (expr); \ - if (!((handler)->defaultHandleResult(cudaResult))) { \ - assert(false); \ - } \ - } while(0) - } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index 5f885abd2d..8a24f83907 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -1,46 +1,37 @@ #ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ #define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif +#include +#include namespace nbl::video { class CCUDADevice; -class CCUDAImportedMemory : public core::IReferenceCounted +namespace cuda_native { - public: - - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalMemory cuExtMem) : - m_device(device), - m_src(src), - m_handle(cuExtMem) {} - - ~CCUDAImportedMemory() override; +struct SAccess; +} - CUexternalMemory getInternalObject() const { return m_handle; } - CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); +class CCUDAImportedMemory : public core::IReferenceCounted +{ + public: + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); - private: + ~CCUDAImportedMemory() override; - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalMemory m_handle; + private: + friend struct cuda_native::SAccess; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 409ef1a676..3ee03fb045 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -4,47 +4,36 @@ #ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ -#ifdef _NBL_COMPILE_WITH_CUDA_ - #include "nbl/video/declarations.h" -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." 
-#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include namespace nbl::video { class CCUDADevice; +namespace cuda_native +{ +struct SAccess; +} + class CCUDAImportedSemaphore : public core::IReferenceCounted { - public: - - CUexternalSemaphore getInternalObject() const { return m_handle; } - CCUDAImportedSemaphore(core::smart_refctd_ptr device, - core::smart_refctd_ptr src, - CUexternalSemaphore semaphore) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_handle(semaphore) - {} - ~CCUDAImportedSemaphore() override; - - private: - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalSemaphore m_handle; + public: + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + ~CCUDAImportedSemaphore() override; + + private: + friend struct cuda_native::SAccess; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h new file mode 100644 index 0000000000..f913664122 --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -0,0 +1,211 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#include "nbl/asset/ICPUBuffer.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 13000 + #error "Need CUDA 13.0 SDK or higher." 
+#endif + +namespace nbl::video::cuda_native +{ + +using LibLoader = system::DefaultFuncPtrLoader; + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + ,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload + ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + 
,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +struct SCUDADeviceInfo +{ + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; +}; + +const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + +bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); +bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + +template +T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } + +const core::vector& getAvailableDevices(const CCUDAHandler& handler); + +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); +} 
+nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); + +struct ptx_and_nvrtcResult_t +{ + core::smart_refctd_ptr ptx; + nvrtcResult result; +}; + +ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +); +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +); + +CUdevice getInternalObject(const CCUDADevice& device); +CUcontext getContext(const CCUDADevice& device); +size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); +CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); +CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); +CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +CUexternalSemaphore 
getInternalObject(const CCUDAImportedSemaphore& semaphore); + +} + +#define ASSERT_CUDA_SUCCESS(expr, handler) \ + do { \ + const auto cudaResult = (expr); \ + if (!nbl::video::cuda_native::defaultHandleResult(*(handler), cudaResult)) { \ + assert(false); \ + } \ + } while(0) + +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 6c3ab2606d..ecf7f555c3 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -798,6 +798,20 @@ if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) COMPONENT Libraries ) endif() + + if(DEFINED NBL_EXT_CUDA_INTEROP_NATIVE_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} + EXPORT NablaCUDAInteropNativeExportTargets + COMPONENT Libraries + ) + install(EXPORT NablaCUDAInteropNativeExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() + endif() endif() if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index 1f815413e8..59ae49285e 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -48,6 +48,10 @@ if (NBL_COMPILE_WITH_CUDA) ${NBL_EXT_CUDA_INTEROP_LIB} PARENT_SCOPE ) + set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB + ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} + PARENT_SCOPE + ) endif() if (NBL_BUILD_IMGUI) diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index aa06c6e7bf..5f59545173 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -1,15 +1,12 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _WIN32 #include #endif -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" - #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -18,28 +15,27 @@ CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, - CUdevice device, + std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler) : m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), - m_handle(device), m_handler(std::move(handler)), - m_allocationGranularity{} + m_native(std::move(nativeState)) { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_native->context), m_handler); - for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) + for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) { #ifdef _WIN32 @@ -50,24 +46,47 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = 
ALLOCATION_HANDLE_TYPE, - .location = { .type = static_cast(locationType), .id = m_handle }, + .requestedHandleTypes = cuda_native::getAllocationHandleType(), + .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); } } -size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation location, size_t size) const +{ + return cuda_native::roundToGranularity(*this,cuda_native::toNative(location),size); +} + +namespace cuda_native +{ + +CUdevice getInternalObject(const CCUDADevice& device) +{ + return SAccess::native(device).handle; +} + +CUcontext getContext(const CCUDADevice& device) { - return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; + return SAccess::native(device).context; +} + +size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) +{ + const auto& granularity = SAccess::native(device).allocationGranularity[location]; + return ((size - 1) / granularity + 1) * granularity; +} + } -CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const +static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto handler = device.getHandler(); + const auto& native = cuda_native::SAccess::native(device); + const auto& cu = 
cuda_native::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -75,19 +94,19 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); return err; } CUmemAccessDesc accessDesc = { - .location = { .type = location, .id = m_handle }, + .location = { .type = location, .id = native.handle }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); return err; } @@ -100,7 +119,8 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor { CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - auto& cu = m_handler->getCUDAFunctionTable(); + auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto nativeLocation = cuda_native::toNative(params.location); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -110,14 +130,15 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = params.location, .id = m_handle }, + .requestedHandleTypes = cuda_native::getAllocationHandleType(), + .location = { .type = nativeLocation, .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; params.granularSize = roundToGranularity(params.location, params.size); + auto nativeState = std::make_unique(); 
CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -133,7 +154,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*this,&nativeState->ptr, params.granularSize, params.alignment, nativeLocation, mem); CUDA_SUCCESS != err) { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); @@ -152,12 +173,12 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -180,12 +201,12 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), std::make_unique(cuExtMem)); } core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = m_handler->getCUDAFunctionTable(); + auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -210,12 +231,12 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph return nullptr; } - 
return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), std::make_unique(cusema)); } CCUDADevice::~CCUDADevice() { - ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); + ASSERT_CUDA_SUCCESS(cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context), m_handler); } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index 65afdca660..94d18c40bb 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -2,14 +2,18 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{} + core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { auto pd = device->getPhysicalDevice(); @@ -18,10 +22,10 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM switch (m_params.location) { - case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case ECUDAMemoryLocation::DEVICE: memoryTypeBits &= vram; break; + case ECUDAMemoryLocation::HOST_NUMA: + case ECUDAMemoryLocation::HOST_NUMA_CURRENT: + case 
ECUDAMemoryLocation::HOST: memoryTypeBits &= ~vram; break; default: break; } @@ -40,15 +44,25 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize), m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize), m_device->getHandler()); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); assert(closeSucceed); +} + +namespace cuda_native +{ + +CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory) +{ + return SAccess::native(memory).ptr; +} + } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 748a88d1a1..49e36083d4 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -13,13 +13,11 @@ namespace nbl::video { CCUDAHandler::CCUDAHandler( - CUDA&& _cuda, - NVRTC&& _nvrtc, + std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)) - , m_nvrtc(std::move(_nvrtc)) + : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) , m_version(_version) @@ -32,29 +30,38 @@ CCUDAHandler::CCUDAHandler( } int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) return; for (int device_i = 0; device_i < deviceCount; device_i++) { CUdevice handle = -1; - if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + if (m_native->cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) continue; CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) + if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; - m_availableDevices.emplace_back(handle, uuid); + auto& nativeDevice = m_native->availableDevices.emplace_back(); + nativeDevice.handle = handle; + nativeDevice.uuid = uuid; + auto& cleanDevice = m_availableDevices.emplace_back(); + memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); - int* attributes = m_availableDevices.back().attributes; + int* attributes = nativeDevice.attributes; for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) - m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + m_native->cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); } } -bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) 
+CCUDAHandler::~CCUDAHandler() = default; + +namespace cuda_native +{ + +bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) { @@ -420,7 +427,12 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt return false; } -bool CCUDAHandler::defaultHandleResult(nvrtcResult result) +bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) +{ + return defaultHandleResult(result,SAccess::logger(handler)); +} + +bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch (result) { @@ -428,19 +440,21 @@ bool CCUDAHandler::defaultHandleResult(nvrtcResult result) return true; break; default: - if (m_nvrtc.pnvrtcGetErrorString) - m_logger.log("%s\n",system::ILogger::ELL_ERROR,m_nvrtc.pnvrtcGetErrorString(result)); + if (SAccess::native(handler).nvrtc.pnvrtcGetErrorString) + SAccess::logger(handler).log("%s\n",system::ILogger::ELL_ERROR,SAccess::native(handler).nvrtc.pnvrtcGetErrorString(result)); else - m_logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + SAccess::logger(handler).log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } _NBL_DEBUG_BREAK_IF(true); return false; } +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { - CUDA cuda = CUDA( + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" #elif defined(_NBL_POSIX_API_) @@ -450,7 +464,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #endif ); - NVRTC nvrtc = {}; + cuda_native::NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? 
// Version List: https://developer.nvidia.com/cuda-toolkit-archive @@ -468,7 +482,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { std::string path(*verpath); path += *suffix; - nvrtc = NVRTC(path.c_str()); + nvrtc = cuda_native::NVRTC(path.c_str()); if (nvrtc.pnvrtcVersion) break; } @@ -476,7 +490,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste break; } #elif defined(_NBL_POSIX_API_) - nvrtc = NVRTC("nvrtc"); + nvrtc = cuda_native::NVRTC("nvrtc"); //nvrtc_builtins = NVRTC("nvrtc-builtins"); #else #error "Unsuported Platform" @@ -526,10 +540,28 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); + return core::make_smart_refctd_ptr(std::make_unique(std::move(cuda),std::move(nvrtc)), std::move(headers), std::move(_logger), cudaVersion); +} + +namespace cuda_native +{ + +const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) +{ + return SAccess::native(handler).cuda; +} + +const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) +{ + return SAccess::native(handler).nvrtc; } -nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +const core::vector& getAvailableDevices(const CCUDAHandler& handler) +{ + return SAccess::native(handler).availableDevices; +} + +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -538,26 +570,43 @@ nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source #else #error "Unsuported Platform" #endif - return 
m_nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); +} + +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount, const char* const* headerContents, const char* const* includeNames) +{ + const auto filesize = file->getSize(); + std::string source(filesize+1u,'0'); + + system::IFile::success_t bytesRead; + file->read(bytesRead,source.data(),0u,file->getSize()); + source.resize(bytesRead.getBytesProcessed()); + + return createProgram(handler,prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); +} + +nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +{ + return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult CCUDAHandler::getProgramLog(nvrtcProgram prog, std::string& log) +nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return m_nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) +ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetPTXSize(prog,&_size); + nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -567,7 
+616,57 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; +} + +static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) +{ + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + result = compileProgram(handler,program,nvrtcOptions); + if (log) + getProgramLog(handler,program,*log); + if (result!=NVRTC_SUCCESS) + return {nullptr,result}; + + return getPTX(handler,program); +} + +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount, const char* const* headerContents, const char* const* includeNames, + std::string* log) +{ + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); +} + +ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount, const char* const* headerContents, const char* const* includeNames, + std::string* log) +{ + nvrtcProgram program = nullptr; + nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + if (result!=NVRTC_SUCCESS && program) + 
SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + }); + + result = createProgram(handler,&program,file,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); +} + } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -578,7 +677,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - for (const auto& device : m_availableDevices) + for (const auto& device : m_native->availableDevices) { if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { @@ -662,7 +761,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); + return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, std::make_unique(device.handle), core::smart_refctd_ptr(this)); } } return nullptr; diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index a785bad9b9..bbc65f91ab 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -2,30 +2,44 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +namespace cuda_native +{ + +CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory) +{ + return SAccess::native(memory).handle; +} + +CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) { CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; - bufferDesc.size = m_src->getAllocationSize(); + bufferDesc.size = SAccess::source(memory)->getAllocationSize(); - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); + const auto& cu = getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); + return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); } +} + CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_native->handle), m_device->getHandler()); } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index 1ca4a34190..b6e3b319f7 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ 
b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -2,17 +2,31 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "CUDAInteropNativeState.hpp" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +namespace cuda_native +{ + +CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore) +{ + return SAccess::native(semaphore).handle; +} + +} + CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_native->handle), m_device->getHandler()); } } diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 7a69e62ad4..973fbb232a 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -5,6 +5,7 @@ if (NBL_COMPILE_WITH_CUDA) set(NBL_EXT_CUDA_INTEROP_H ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h + ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInteropNative.h ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h @@ -26,12 +27,24 @@ if (NBL_COMPILE_WITH_CUDA) "${NBL_EXT_CUDA_INTEROP_SRC}" "" "" - "_NBL_COMPILE_WITH_CUDA_" + "" ) - target_link_libraries(${LIB_NAME} PUBLIC $) + target_compile_definitions(${LIB_NAME} PRIVATE _NBL_COMPILE_WITH_CUDA_) + 
target_include_directories(${LIB_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) + + set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "NblExtCUDA_INTEROP_NATIVE") + add_library(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE + $ + $ + CUDA::toolkit + ) + set_target_properties(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInteropNative") + add_library(Nabla::ext::CUDAInteropNative ALIAS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) + set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}" PARENT_SCOPE) endif() add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp new file mode 100644 index 0000000000..2dc3c3bbca --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp @@ -0,0 +1,106 @@ +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ + +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +#include + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState +{ + cuda_native::CUDA cuda; + cuda_native::NVRTC nvrtc; + core::vector availableDevices; + + SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) + : cuda(std::move(_cuda)) + , nvrtc(std::move(_nvrtc)) + {} +}; + +struct CCUDADevice::SNativeState +{ + CUdevice handle = {}; + CUcontext context = nullptr; + std::array allocationGranularity = {}; + + explicit SNativeState(CUdevice _handle) + : handle(_handle) + {} +}; + +struct CCUDAExportableMemory::SNativeState +{ + CUdeviceptr ptr = 0; +}; + +struct CCUDAImportedMemory::SNativeState +{ + CUexternalMemory handle = nullptr; + + explicit SNativeState(CUexternalMemory _handle) + : handle(_handle) + {} +}; + +struct CCUDAImportedSemaphore::SNativeState +{ + CUexternalSemaphore 
handle = nullptr; + + explicit SNativeState(CUexternalSemaphore _handle) + : handle(_handle) + {} +}; + +namespace cuda_native +{ + +inline CUmemLocationType toNative(ECUDAMemoryLocation location) +{ + return static_cast(static_cast(location)); +} + +inline ECUDAMemoryLocation toNabla(CUmemLocationType location) +{ + return static_cast(static_cast(location)); +} + +inline CUmemAllocationHandleType getAllocationHandleType() +{ +#ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; +#else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif +} + +struct SAccess +{ + static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } + static const CCUDAHandler::SNativeState& native(const CCUDAHandler& handler) { return *handler.m_native; } + + static CCUDADevice::SNativeState& native(CCUDADevice& device) { return *device.m_native; } + static const CCUDADevice::SNativeState& native(const CCUDADevice& device) { return *device.m_native; } + + static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; } + static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; } + + static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } + static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } + + static CCUDAImportedSemaphore::SNativeState& native(CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } + static const CCUDAImportedSemaphore::SNativeState& native(const CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } + + static system::logger_opt_ptr logger(const CCUDAHandler& handler) { return handler.m_logger.get().get(); } + static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } + static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return 
memory.m_device.get(); } + static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } +}; + +} + +} + +#endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md new file mode 100644 index 0000000000..1fd88d1b04 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/README.md @@ -0,0 +1,23 @@ +# CUDA Interop Targets + +- `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK. +- `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK. +- `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`. +- Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=`. +- A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target. +- Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. 
+ +```cmake +find_package(Nabla CONFIG REQUIRED) +target_link_libraries(app PRIVATE Nabla::Nabla) +``` + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) +target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop) +``` + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInteropNative) +target_link_libraries(app PRIVATE Nabla::ext::CUDAInteropNative) +``` diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index cd9ba7b70e..71bdac260d 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -1,8 +1,14 @@ cmake_minimum_required(VERSION 3.30) project(NblExtCUDAInteropSmoke CXX) +option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF) + if(NOT TARGET Nabla::Nabla) - find_package(Nabla REQUIRED CONFIG COMPONENTS Core CUDAInterop) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core CUDAInterop) + if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInteropNative) + endif() + find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) endif() enable_testing() @@ -18,6 +24,11 @@ nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary. 
target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) if(TARGET Nabla::ext::CUDAInterop) - nbl_add_cuda_interop_smoke(NblExtCUDAInteropOptInSmoke opt_in.cpp) - target_link_libraries(NblExtCUDAInteropOptInSmoke PRIVATE Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanOptInSmoke clean_opt_in.cpp) + target_link_libraries(NblExtCUDAInteropCleanOptInSmoke PRIVATE Nabla::ext::CUDAInterop) +endif() + +if(TARGET Nabla::ext::CUDAInteropNative) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) + target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInteropNative) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp new file mode 100644 index 0000000000..6952433f9e --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -0,0 +1,42 @@ +#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/system/IApplicationFramework.h" + +#include + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla::ext::CUDAInterop must not propagate the CUDA build define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla::ext::CUDAInterop must not require CUDA SDK headers." 
+#endif + +namespace +{ + +class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + static_assert(std::is_same_v); + + const nbl::video::CCUDAExportableMemory::SCreationParams params = { + .size = 4096, + .alignment = 4096, + .location = nbl::video::ECUDAMemoryLocation::DEVICE, + }; + return isAPILoaded() && params.location==nbl::video::ECUDAMemoryLocation::DEVICE; + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +} + +NBL_MAIN_FUNC(CUDAInteropCleanOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp similarity index 72% rename from src/nbl/ext/CUDAInterop/smoke/opt_in.cpp rename to src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index bc8c8952bd..d868b2eaa7 100644 --- a/src/nbl/ext/CUDAInterop/smoke/opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -1,4 +1,4 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" #include "nbl/system/IApplicationFramework.h" #include @@ -7,8 +7,8 @@ #include #include -#ifndef _NBL_COMPILE_WITH_CUDA_ -#error "CUDA interop consumers must opt in through Nabla::ext::CUDAInterop." +#ifndef CUDA_VERSION +#error "Nabla::ext::CUDAInteropNative must expose CUDA SDK headers." 
#endif namespace @@ -25,7 +25,7 @@ using namespace nbl::video; auto cudaMemory = cudaDevice.createExportableMemory({ .size = 4096, .alignment = 4096, - .location = CU_MEM_LOCATION_TYPE_DEVICE, + .location = ECUDAMemoryLocation::DEVICE, }); if (!cudaMemory) return false; @@ -36,15 +36,16 @@ using namespace nbl::video; CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) - importedFromVulkan->getMappedBuffer(&mappedVulkanMemory); + cuda_native::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); - const CUexternalSemaphore cudaSemaphore = importedSemaphore ? importedSemaphore->getInternalObject():nullptr; - return exportedToVulkan.get() && mappedVulkanMemory && cudaSemaphore; + const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(*cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(*importedSemaphore):nullptr; + return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = handler.getCUDAFunctionTable(); + auto& cuda = cuda_native::getCUDAFunctionTable(handler); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -83,7 +84,7 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) } } -class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework +class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework { using base_t = nbl::system::IApplicationFramework; @@ -95,13 +96,13 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework if (!isAPILoaded()) return false; - static_assert(std::is_same_v().getInternalObject()), CUdevice>); + static_assert(std::is_same_v())), CUdevice>); auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); if (!handler) return true; - const auto& devices = handler->getAvailableDevices(); + const auto& devices = 
nbl::video::cuda_native::getAvailableDevices(*handler); if (devices.empty()) return true; @@ -112,4 +113,4 @@ class CUDAInteropOptInSmoke final : public nbl::system::IApplicationFramework bool keepRunning() override { return false; } }; -NBL_MAIN_FUNC(CUDAInteropOptInSmoke) +NBL_MAIN_FUNC(CUDAInteropNativeOptInSmoke) From 49bcb2cf6c96e7fca42a16142c28ddc83686c579 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:40:53 +0200 Subject: [PATCH 092/149] Add native CUDA accessor overloads --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 172 ++++++++++++++++++ src/nbl/ext/CUDAInterop/README.md | 1 + .../ext/CUDAInterop/smoke/native_opt_in.cpp | 8 +- 4 files changed, 178 insertions(+), 5 deletions(-) diff --git a/examples_tests b/examples_tests index 7a2a4f604f..dfa2b7ac39 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7a2a4f604fd941984d6624e3059f7380cc6592a2 +Subproject commit dfa2b7ac39c6b9ae94ae2eb70c8f6ec251a9715e diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index f913664122..ea6313f26b 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -146,6 +146,26 @@ struct SCUDADeviceInfo const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) +{ + return getCUDAFunctionTable(*handler); +} + +inline const CUDA& getCUDAFunctionTable(const core::smart_refctd_ptr& handler) +{ + return getCUDAFunctionTable(*handler); +} + +inline const NVRTC& getNVRTCFunctionTable(const CCUDAHandler* handler) +{ + return getNVRTCFunctionTable(*handler); +} + +inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr& handler) +{ + return getNVRTCFunctionTable(*handler); +} + bool defaultHandleResult(CUresult result, const 
system::logger_opt_ptr& logger); bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); @@ -155,12 +175,46 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } const core::vector& getAvailableDevices(const CCUDAHandler& handler); +inline const core::vector& getAvailableDevices(const CCUDAHandler* handler) +{ + return getAvailableDevices(*handler); +} + +inline const core::vector& getAvailableDevices(const core::smart_refctd_ptr& handler) +{ + return getAvailableDevices(*handler); +} + nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); } nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return 
createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); +} +inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) +{ + return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); +} nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -189,6 +243,54 @@ ptx_and_nvrtcResult_t compileDirectlyToPTX( const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ); +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler* handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* 
includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + const core::smart_refctd_ptr& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler* handler, const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + const core::smart_refctd_ptr& handler, const char* source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler* handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); +} +inline ptx_and_nvrtcResult_t compileDirectlyToPTX( + const core::smart_refctd_ptr& handler, system::IFile* file, core::SRange nvrtcOptions, + const int headerCount=0, const 
char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr +) +{ + return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); +} CUdevice getInternalObject(const CCUDADevice& device); CUcontext getContext(const CCUDADevice& device); @@ -198,6 +300,76 @@ CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +inline CUdevice getInternalObject(const CCUDADevice* device) +{ + return getInternalObject(*device); +} + +inline CUdevice getInternalObject(const core::smart_refctd_ptr& device) +{ + return getInternalObject(*device); +} + +inline CUcontext getContext(const CCUDADevice* device) +{ + return getContext(*device); +} + +inline CUcontext getContext(const core::smart_refctd_ptr& device) +{ + return getContext(*device); +} + +inline size_t roundToGranularity(const CCUDADevice* device, CUmemLocationType location, size_t size) +{ + return roundToGranularity(*device,location,size); +} + +inline size_t roundToGranularity(const core::smart_refctd_ptr& device, CUmemLocationType location, size_t size) +{ + return roundToGranularity(*device,location,size); +} + +inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory) +{ + return getDeviceptr(*memory); +} + +inline CUdeviceptr getDeviceptr(const core::smart_refctd_ptr& memory) +{ + return getDeviceptr(*memory); +} + +inline CUexternalMemory getInternalObject(const CCUDAImportedMemory* memory) +{ + return getInternalObject(*memory); +} + +inline CUexternalMemory getInternalObject(const core::smart_refctd_ptr& memory) +{ + return getInternalObject(*memory); +} + +inline CUresult getMappedBuffer(const CCUDAImportedMemory* memory, CUdeviceptr* mappedBuffer) +{ + return getMappedBuffer(*memory,mappedBuffer); +} + +inline CUresult getMappedBuffer(const 
core::smart_refctd_ptr& memory, CUdeviceptr* mappedBuffer) +{ + return getMappedBuffer(*memory,mappedBuffer); +} + +inline CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore* semaphore) +{ + return getInternalObject(*semaphore); +} + +inline CUexternalSemaphore getInternalObject(const core::smart_refctd_ptr& semaphore) +{ + return getInternalObject(*semaphore); +} + } #define ASSERT_CUDA_SUCCESS(expr, handler) \ diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 1fd88d1b04..623c07ec9e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,6 +6,7 @@ - Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=`. - A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target. - Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. +- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers. ```cmake find_package(Nabla CONFIG REQUIRED) diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index d868b2eaa7..4c001ab6ce 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -36,10 +36,10 @@ using namespace nbl::video; CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) - cuda_native::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); + cuda_native::getMappedBuffer(importedFromVulkan,&mappedVulkanMemory); - const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(*cudaMemory); - const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
cuda_native::getInternalObject(*importedSemaphore):nullptr; + const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(importedSemaphore):nullptr; return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } @@ -102,7 +102,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!handler) return true; - const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler); + const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler); if (devices.empty()) return true; From d85657e381ecd537aa20a16ab227aa38754083d4 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:48:06 +0200 Subject: [PATCH 093/149] Document CUDA interop target split --- src/nbl/ext/CUDAInterop/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 623c07ec9e..a73b9d9c21 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,5 +1,8 @@ # CUDA Interop Targets +This extension keeps CUDA interop available without making CUDA a default public +compile-time dependency of Nabla. + - `Nabla::Nabla` stays CUDA-free. `find_package(Nabla CONFIG)` does not require the CUDA SDK. - `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK. - `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`. @@ -8,6 +11,18 @@ - Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. 
- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers. +## Design + +- The default Nabla package remains relocatable and usable on machines without the CUDA SDK. +- CUDA is used privately to build the interop library. CUDA SDK headers become visible to consumers only when `CUDAInteropNative` is requested. +- Clean interop headers expose Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. +- Native interop headers expose raw CUDA Driver API and NVRTC types for examples and applications that need direct CUDA work. +- The split is intentionally similar to the OpenCV CUDA shape: common CUDA-facing headers stay clean, while raw CUDA access lives behind explicit opt-in accessor/native headers. +- This avoids a transitive public compile-time dependency on CUDA while preserving the low-level workflow for kernels, `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, and external semaphores. +- Package consumers can pick their own compatible CUDA SDK for native code without rebuilding Nabla or the clean interop library. + +## Usage + ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) From 6e8c4f99399b3111c2800a8ffd5f36cd9b17c418 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 16:58:35 +0200 Subject: [PATCH 094/149] Trim CUDA interop README wording --- src/nbl/ext/CUDAInterop/README.md | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index a73b9d9c21..104f7f2eca 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,25 +1,22 @@ # CUDA Interop Targets -This extension keeps CUDA interop available without making CUDA a default public -compile-time dependency of Nabla. - -- `Nabla::Nabla` stays CUDA-free. 
`find_package(Nabla CONFIG)` does not require the CUDA SDK. -- `Nabla::ext::CUDAInterop` is the clean Nabla interop target. Its public headers do not include `cuda.h` or `nvrtc.h`, so consumers can use a CUDA-enabled Nabla package without installing the CUDA SDK. -- `Nabla::ext::CUDAInteropNative` is the explicit raw CUDA opt-in target. It exposes `CUDAInteropNative.h`, CUDA Driver API and NVRTC types, and requires `CUDAToolkit`. -- Consumers can request native CUDA with `find_package(Nabla CONFIG COMPONENTS Core CUDAInteropNative)` and override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=`. -- A consumer can use a newer compatible local CUDA SDK through `CUDAInteropNative` without rebuilding Nabla or the clean `CUDAInterop` target. -- Rebuilds stay local: changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. -- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`, so opt-in code can keep CUDA usage terse without moving CUDA types into clean headers. +- `Nabla::Nabla` does not require the CUDA SDK. +- `Nabla::ext::CUDAInterop` provides Nabla CUDA interop types. Its public headers do not include `cuda.h` or `nvrtc.h`. +- `Nabla::ext::CUDAInteropNative` provides raw CUDA Driver API and NVRTC access through `CUDAInteropNative.h`. +- `CUDAInteropNative` requires `CUDAToolkit`. `CUDAInterop` does not expose that requirement to consumers. +- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInteropNative`. +- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla or `CUDAInterop`. +- Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. +- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. ## Design -- The default Nabla package remains relocatable and usable on machines without the CUDA SDK. -- CUDA is used privately to build the interop library. 
CUDA SDK headers become visible to consumers only when `CUDAInteropNative` is requested. -- Clean interop headers expose Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. -- Native interop headers expose raw CUDA Driver API and NVRTC types for examples and applications that need direct CUDA work. -- The split is intentionally similar to the OpenCV CUDA shape: common CUDA-facing headers stay clean, while raw CUDA access lives behind explicit opt-in accessor/native headers. -- This avoids a transitive public compile-time dependency on CUDA while preserving the low-level workflow for kernels, `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, and external semaphores. -- Package consumers can pick their own compatible CUDA SDK for native code without rebuilding Nabla or the clean interop library. +- CUDA is used privately while building the interop library. +- CUDA SDK headers become visible to consumers only through `CUDAInteropNative`. +- `CUDAInterop` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. +- `CUDAInteropNative` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. +- The target split follows the same general dependency shape used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. +- This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. 
## Usage From 881e9b83c19388647336d56ef438f07b66781641 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Wed, 6 May 2026 17:43:35 +0200 Subject: [PATCH 095/149] Move CUDA interop into Nabla --- cmake/NablaConfig.cmake.in | 17 +-- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDADevice.h | 2 +- .../ext/CUDAInterop/CCUDAExportableMemory.h | 2 +- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 2 +- .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h | 2 +- .../ext/CUDAInterop/CCUDAImportedSemaphore.h | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 40 +++---- src/nbl/CMakeLists.txt | 46 ++++---- src/nbl/ext/CMakeLists.txt | 8 -- src/nbl/ext/CUDAInterop/CMakeLists.txt | 51 ++------- src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp | 100 ++++++++++++++++++ src/nbl/ext/CUDAInterop/README.md | 28 +++-- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 14 ++- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 4 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 2 +- 16 files changed, 183 insertions(+), 139 deletions(-) create mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index afff3dcccc..8b9f62e548 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -7,7 +7,6 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) -set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -31,12 +30,6 @@ if(_NBL_NABLA_COMPONENTS) set(_NBL_NABLA_LOAD_CORE ON) set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) set(Nabla_CUDAInterop_FOUND TRUE) - elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInteropNative") - set(_NBL_NABLA_LOAD_CORE ON) - set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) - set(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE ON) - set(Nabla_CUDAInterop_FOUND TRUE) - 
set(Nabla_CUDAInteropNative_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -93,10 +86,6 @@ if(_NBL_NABLA_LOAD_NSC) endif() if(_NBL_NABLA_LOAD_CUDA_INTEROP) - _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) -endif() - -if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE) include(CMakeFindDependencyMacro) if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT "${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") @@ -104,9 +93,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP_NATIVE) endif() find_dependency(CUDAToolkit 13.0 REQUIRED) - _nbl_try_include_component("CUDAInteropNative" "NablaCUDAInteropNativeExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND) - if(_NBL_NABLA_CUDA_INTEROP_NATIVE_FOUND AND TARGET Nabla::ext::CUDAInteropNative) - target_link_libraries(Nabla::ext::CUDAInteropNative INTERFACE CUDA::toolkit) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) endif() endif() diff --git a/examples_tests b/examples_tests index dfa2b7ac39..3b59c9bc05 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit dfa2b7ac39c6b9ae94ae2eb70c8f6ec251a9715e +Subproject commit 3b59c9bc05d8784277d3a18e11f423dcb8ae2b74 diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index 25c40e7ed6..7b994e053f 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -22,7 +22,7 @@ namespace cuda_native struct SAccess; } -class CCUDADevice : public core::IReferenceCounted +class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 5973c31fac..b331d6a258 100644 --- 
a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -26,7 +26,7 @@ enum class ECUDAMemoryLocation : uint32_t HOST_NUMA_CURRENT = 4 }; -class CCUDAExportableMemory : public core::IReferenceCounted +class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 063598a518..6a3cc6c496 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -25,7 +25,7 @@ namespace cuda_native struct SAccess; } -class CCUDAHandler : public core::IReferenceCounted +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index 8a24f83907..adb803f12c 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -16,7 +16,7 @@ namespace cuda_native struct SAccess; } -class CCUDAImportedMemory : public core::IReferenceCounted +class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 3ee03fb045..894f2444c0 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -19,7 +19,7 @@ namespace cuda_native struct SAccess; } -class CCUDAImportedSemaphore : public core::IReferenceCounted +class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: struct SNativeState; diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index ea6313f26b..b73f2ae252 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ 
b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -143,8 +143,8 @@ struct SCUDADeviceInfo int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; }; -const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) { @@ -166,14 +166,14 @@ inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } -const core::vector& getAvailableDevices(const CCUDAHandler& handler); +NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); inline const core::vector& getAvailableDevices(const CCUDAHandler* handler) { @@ -185,12 +185,12 @@ inline const core::vector& getAvailableDevices(const core::smar return getAvailableDevices(*handler); } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 
nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); @@ -215,8 +215,8 @@ inline nvrtcResult createProgram(const core::smart_refctd_ptr& han { return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); } -nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); +NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); struct ptx_and_nvrtcResult_t { @@ -224,8 +224,8 @@ struct ptx_and_nvrtcResult_t nvrtcResult result; }; -ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); -ptx_and_nvrtcResult_t compileDirectlyToPTX( +NBL_API2 ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr @@ -238,7 +238,7 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( { return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); } -ptx_and_nvrtcResult_t compileDirectlyToPTX( +NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( 
CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr @@ -292,13 +292,13 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); } -CUdevice getInternalObject(const CCUDADevice& device); -CUcontext getContext(const CCUDADevice& device); -size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); -CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); -CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); -CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); -CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); +NBL_API2 CUcontext getContext(const CCUDADevice& device); +NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); +NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); +NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); +NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); inline CUdevice getInternalObject(const CCUDADevice* device) { diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index ecf7f555c3..f0f7b275c0 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -124,6 +124,20 @@ set(NBL_CORE_SOURCES core/alloc/refctd_memory_resource.cpp core/hash/blake.cpp ) + +set(NBL_CUDA_INTEROP_SOURCES + ext/CUDAInterop/CUDAInteropStubs.cpp +) +if(NBL_COMPILE_WITH_CUDA) + set(NBL_CUDA_INTEROP_SOURCES + ext/CUDAInterop/CCUDADevice.cpp + ext/CUDAInterop/CCUDAExportableMemory.cpp + 
ext/CUDAInterop/CCUDAHandler.cpp + ext/CUDAInterop/CCUDAImportedMemory.cpp + ext/CUDAInterop/CCUDAImportedSemaphore.cpp + ) +endif() + set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp system/IFileBase.cpp @@ -306,6 +320,7 @@ set(NABLA_SRCS_COMMON ${NBL_VIDEO_SOURCES} ${NBL_SCENE_SOURCES} ${NBL_META_SOURCES} + ${NBL_CUDA_INTEROP_SOURCES} ) if(MSVC) @@ -416,6 +431,11 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() +if(NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PRIVATE _NBL_COMPILE_WITH_CUDA_) + target_include_directories(Nabla PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() + set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) @@ -783,35 +803,17 @@ add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) - set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) - - set(_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS) - if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) - list(APPEND _NBL_EXT_CUDA_INTEROP_INSTALL_ARGS EXPORT NablaCUDAInteropExportTargets) - endif() - nbl_install_lib_spec(${NBL_EXT_CUDA_INTEROP_LIB} "nbl/ext/CUDA_INTEROP" ${_NBL_EXT_CUDA_INTEROP_INSTALL_ARGS}) - if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_LIB} + EXPORT NablaCUDAInteropExportTargets + COMPONENT Libraries + ) install(EXPORT NablaCUDAInteropExportTargets NAMESPACE Nabla:: DESTINATION cmake COMPONENT Libraries ) endif() - - if(DEFINED NBL_EXT_CUDA_INTEROP_NATIVE_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) - if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) - install(TARGETS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} - EXPORT NablaCUDAInteropNativeExportTargets - COMPONENT Libraries - ) - install(EXPORT NablaCUDAInteropNativeExportTargets - NAMESPACE Nabla:: - DESTINATION cmake - COMPONENT Libraries - ) - 
endif() - endif() endif() if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index 59ae49285e..264cfc7c2d 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -40,18 +40,10 @@ endif() add_subdirectory(CUDAInterop) if (NBL_COMPILE_WITH_CUDA) - set(NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS - ${NBL_EXT_CUDA_INTEROP_INCLUDE_DIRS} - PARENT_SCOPE - ) set(NBL_EXT_CUDA_INTEROP_LIB ${NBL_EXT_CUDA_INTEROP_LIB} PARENT_SCOPE ) - set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB - ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} - PARENT_SCOPE - ) endif() if (NBL_BUILD_IMGUI) diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 973fbb232a..438ab51d8f 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -1,50 +1,17 @@ include(${NBL_ROOT_PATH}/cmake/common.cmake) if (NBL_COMPILE_WITH_CUDA) - set(NBL_EXT_INTERNAL_INCLUDE_DIR "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop") + set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") - set(NBL_EXT_CUDA_INTEROP_H - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInterop.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CUDAInteropNative.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDADevice.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAExportableMemory.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAHandler.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedMemory.h - ${NBL_EXT_INTERNAL_INCLUDE_DIR}/CCUDAImportedSemaphore.h + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE + $ + $ + $ ) - - set(NBL_EXT_CUDA_INTEROP_SRC - CCUDADevice.cpp - CCUDAExportableMemory.cpp - CCUDAHandler.cpp - CCUDAImportedMemory.cpp - CCUDAImportedSemaphore.cpp - ) - - nbl_create_ext_library_project( - CUDA_INTEROP - "${NBL_EXT_CUDA_INTEROP_H}" - "${NBL_EXT_CUDA_INTEROP_SRC}" - "" - "" - "" - ) - - target_compile_definitions(${LIB_NAME} PRIVATE _NBL_COMPILE_WITH_CUDA_) - 
target_include_directories(${LIB_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) - set_target_properties(${LIB_NAME} PROPERTIES EXPORT_NAME "ext::CUDAInterop") - add_library(Nabla::ext::CUDAInterop ALIAS ${LIB_NAME}) - - set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "NblExtCUDA_INTEROP_NATIVE") - add_library(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE) - target_link_libraries(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} INTERFACE - $ - $ - CUDA::toolkit - ) - set_target_properties(${NBL_EXT_CUDA_INTEROP_NATIVE_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInteropNative") - add_library(Nabla::ext::CUDAInteropNative ALIAS ${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}) - set(NBL_EXT_CUDA_INTEROP_NATIVE_LIB "${NBL_EXT_CUDA_INTEROP_NATIVE_LIB}" PARENT_SCOPE) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") + add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) + set(NBL_EXT_CUDA_INTEROP_LIB "${NBL_EXT_CUDA_INTEROP_LIB}" PARENT_SCOPE) endif() add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp new file mode 100644 index 0000000000..db2b068391 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp @@ -0,0 +1,100 @@ +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState {}; +struct CCUDADevice::SNativeState {}; +struct CCUDAExportableMemory::SNativeState {}; +struct CCUDAImportedMemory::SNativeState {}; +struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{} + +CCUDAHandler::~CCUDAHandler() = default; + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr 
CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ + return nullptr; +} + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_physicalDevice(vulkanDevice) + , m_virtualArchitecture(virtualArchitecture) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{} + +CCUDADevice::~CCUDADevice() = default; + +size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const +{ + return size; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , 
m_native(std::move(nativeState)) +{} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +} diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 104f7f2eca..6eee617714 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,21 +1,22 @@ # CUDA Interop Targets - `Nabla::Nabla` does not require the CUDA SDK. -- `Nabla::ext::CUDAInterop` provides Nabla CUDA interop types. Its public headers do not include `cuda.h` or `nvrtc.h`. -- `Nabla::ext::CUDAInteropNative` provides raw CUDA Driver API and NVRTC access through `CUDAInteropNative.h`. -- `CUDAInteropNative` requires `CUDAToolkit`. `CUDAInterop` does not expose that requirement to consumers. -- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInteropNative`. -- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla or `CUDAInterop`. +- `Nabla::Nabla` provides Nabla CUDA interop types when the package was built with CUDA support. +- Nabla CUDA interop public headers do not include `cuda.h` or `nvrtc.h`. +- `Nabla::ext::CUDAInterop` is the raw CUDA Driver API and NVRTC opt-in target. +- `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. +- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. +- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla. - Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. ## Design -- CUDA is used privately while building the interop library. -- CUDA SDK headers become visible to consumers only through `CUDAInteropNative`. -- `CUDAInterop` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. 
-- `CUDAInteropNative` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. -- The target split follows the same general dependency shape used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. +- CUDA is used privately while building `Nabla::Nabla`. +- CUDA SDK headers become visible to consumers only through `Nabla::ext::CUDAInterop`. +- `Nabla::Nabla` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. +- `Nabla::ext::CUDAInterop` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. +- The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. - This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. ## Usage @@ -27,10 +28,5 @@ target_link_libraries(app PRIVATE Nabla::Nabla) ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) -target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop) -``` - -```cmake -find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInteropNative) -target_link_libraries(app PRIVATE Nabla::ext::CUDAInteropNative) +target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) ``` diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 71bdac260d..bdda95fb03 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -4,9 +4,9 @@ project(NblExtCUDAInteropSmoke CXX) option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." 
OFF) if(NOT TARGET Nabla::Nabla) - set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core CUDAInterop) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) - list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInteropNative) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInterop) endif() find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) endif() @@ -23,12 +23,10 @@ endfunction() nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) -if(TARGET Nabla::ext::CUDAInterop) - nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanOptInSmoke clean_opt_in.cpp) - target_link_libraries(NblExtCUDAInteropCleanOptInSmoke PRIVATE Nabla::ext::CUDAInterop) -endif() +nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanNablaSmoke clean_opt_in.cpp) +target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) -if(TARGET Nabla::ext::CUDAInteropNative) +if(TARGET Nabla::ext::CUDAInterop) nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) - target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInteropNative) + target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInterop) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp index 6952433f9e..348caa766e 100644 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -4,11 +4,11 @@ #include #ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla::ext::CUDAInterop must not propagate the CUDA build define." +#error "Nabla::Nabla must not propagate the CUDA build define." #endif #ifdef CUDA_VERSION -#error "Nabla::ext::CUDAInterop must not require CUDA SDK headers." +#error "Nabla::Nabla must not require CUDA SDK headers." 
#endif namespace diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 4c001ab6ce..a78f710040 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -8,7 +8,7 @@ #include #ifndef CUDA_VERSION -#error "Nabla::ext::CUDAInteropNative must expose CUDA SDK headers." +#error "Nabla::ext::CUDAInterop must expose CUDA SDK headers." #endif namespace From 5dd1134ffc7d144e24f0ee3a55a283025b01fed8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 06:15:03 +0200 Subject: [PATCH 096/149] Document CUDA interop accessor model --- src/nbl/ext/CUDAInterop/README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 6eee617714..a7c1e654be 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -19,6 +19,14 @@ - The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. - This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. +## OpenCV Reference + +- OpenCV's common CUDA header includes OpenCV headers, not raw CUDA SDK headers: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L51-L52). +- OpenCV keeps the public stream type as an OpenCV abstraction and grants access through `StreamAccessor`: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L916-L979). 
+- OpenCV's raw CUDA opt-in header says it is the only header that depends on the CUDA Runtime API, then includes `<cuda_runtime.h>` and exposes accessor types: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). +- OpenCV also keeps implementation CUDA headers private and includes `<cuda.h>` / `<cuda_runtime.h>` there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). +- The same split is used here: Nabla CUDA objects stay in `Nabla::Nabla`, and raw CUDA handles/functions are available only after including `CUDAInteropNative.h` and linking `Nabla::ext::CUDAInterop`. + ## Usage ```cmake From e514df7f505bbc168f13ccc50750a62c2e6680bf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 06:41:06 +0200 Subject: [PATCH 097/149] Inline CUDA interop stubs --- src/nbl/CMakeLists.txt | 15 +-- src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 50 ++++++++- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 27 ++++- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 38 ++++++- .../ext/CUDAInterop/CCUDAImportedMemory.cpp | 21 +++- .../CUDAInterop/CCUDAImportedSemaphore.cpp | 22 +++- src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp | 100 ------------------ src/nbl/ext/CUDAInterop/README.md | 12 +++ 8 files changed, 169 insertions(+), 116 deletions(-) delete mode 100644 src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index f0f7b275c0..ccb600ca32 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -126,17 +126,12 @@ set(NBL_CORE_SOURCES ) set(NBL_CUDA_INTEROP_SOURCES - ext/CUDAInterop/CUDAInteropStubs.cpp + ext/CUDAInterop/CCUDADevice.cpp + ext/CUDAInterop/CCUDAExportableMemory.cpp + ext/CUDAInterop/CCUDAHandler.cpp + ext/CUDAInterop/CCUDAImportedMemory.cpp + ext/CUDAInterop/CCUDAImportedSemaphore.cpp ) 
-if(NBL_COMPILE_WITH_CUDA) - set(NBL_CUDA_INTEROP_SOURCES - ext/CUDAInterop/CCUDADevice.cpp - ext/CUDAInterop/CCUDAExportableMemory.cpp - ext/CUDAInterop/CCUDAHandler.cpp - ext/CUDAInterop/CCUDAImportedMemory.cpp - ext/CUDAInterop/CCUDAImportedSemaphore.cpp - ) -endif() set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index 5f59545173..7d002c86ca 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -1,13 +1,15 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +#include "nbl/ext/CUDAInterop/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" #ifdef _WIN32 #include #endif -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -241,4 +243,50 @@ CCUDADevice::~CCUDADevice() } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDADevice::SNativeState {}; + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_physicalDevice(vulkanDevice) + , m_virtualArchitecture(virtualArchitecture) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{} + +CCUDADevice::~CCUDADevice() = default; + +size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const +{ + return size; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index 94d18c40bb..a89e42b2f6 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -2,9 +2,11 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { @@ -66,4 +68,27 @@ CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory) } } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAExportableMemory::SNativeState {}; + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 49e36083d4..51f0656f6c 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -2,10 +2,11 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" -#include "nbl/system/CFileView.h" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" +#include "nbl/system/CFileView.h" #include "jitify/jitify.hpp" @@ -769,4 +770,37 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAHandler::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{} + +CCUDAHandler::~CCUDAHandler() = default; + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ + return nullptr; +} + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index bbc65f91ab..8de3ce3e63 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -2,9 +2,10 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" namespace nbl::video { @@ -44,4 +45,22 @@ CCUDAImportedMemory::~CCUDAImportedMemory() } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedMemory::SNativeState {}; + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +} + #endif diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index b6e3b319f7..fdbb56b0cf 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -2,9 +2,11 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "CUDAInteropNativeState.hpp" +#include "nbl/ext/CUDAInterop/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + namespace nbl::video { CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) @@ -30,4 +32,22 @@ CCUDAImportedSemaphore::~CCUDAImportedSemaphore() } } +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +} + #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp b/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp deleted file mode 100644 index db2b068391..0000000000 --- a/src/nbl/ext/CUDAInterop/CUDAInteropStubs.cpp +++ /dev/null @@ -1,100 +0,0 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" - -namespace nbl::video -{ - -struct CCUDAHandler::SNativeState {}; -struct CCUDADevice::SNativeState {}; -struct CCUDAExportableMemory::SNativeState {}; -struct CCUDAImportedMemory::SNativeState {}; -struct CCUDAImportedSemaphore::SNativeState {}; - -CCUDAHandler::CCUDAHandler( - std::unique_ptr&& nativeState, - core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) - : m_native(std::move(nativeState)) - , m_headers(std::move(_headers)) - , m_logger(std::move(_logger)) - , m_version(_version) -{} - -CCUDAHandler::~CCUDAHandler() = default; - -core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) -{ - return nullptr; -} - -core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) -{ - return nullptr; -} - -CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& vulkanConnection, - IPhysicalDevice* const vulkanDevice, - const E_VIRTUAL_ARCHITECTURE virtualArchitecture, - std::unique_ptr&& nativeState, - core::smart_refctd_ptr&& handler) - : m_logger(nullptr) - , m_vulkanConnection(std::move(vulkanConnection)) - , m_physicalDevice(vulkanDevice) - , m_virtualArchitecture(virtualArchitecture) - , m_handler(std::move(handler)) - , m_native(std::move(nativeState)) -{} - -CCUDADevice::~CCUDADevice() = default; - 
-size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const -{ - return size; -} - -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) -{ - return nullptr; -} - -core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) -{ - return nullptr; -} - -core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) -{ - return nullptr; -} - -CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) - : m_device(std::move(device)) - , m_params(std::move(params)) - , m_native(std::move(nativeState)) -{} - -CCUDAExportableMemory::~CCUDAExportableMemory() = default; - -core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const -{ - return nullptr; -} - -CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_native(std::move(nativeState)) -{} - -CCUDAImportedMemory::~CCUDAImportedMemory() = default; - -CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_native(std::move(nativeState)) -{} - -CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; - -} diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index a7c1e654be..407b5e81b3 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -38,3 +38,15 @@ target_link_libraries(app PRIVATE Nabla::Nabla) find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) ``` + +## Properties + +- `Nabla::Nabla` can be built with CUDA support without making CUDA SDK headers a 
public compile-time requirement. +- Consumers that only link `Nabla::Nabla` do not need a CUDA SDK to parse Nabla headers. +- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop` explicitly. +- Raw CUDA access is not wrapped away. Native code can use CUDA Driver API types, NVRTC types, and Nabla native accessors in the opt-in path. +- The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. +- CUDA OFF implementations are local stubs in the same `.cpp` files. Clean API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`. +- A package built with CUDA support can be consumed without a local CUDA SDK unless the `CUDAInterop` component is requested. +- A consumer can use a compatible local CUDA SDK for native interop without rebuilding Nabla. 
From e53c838207aaf5f15513e6e622038d852154cfbb Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 09:25:29 +0200 Subject: [PATCH 098/149] Refine CUDA interop boundary --- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDADevice.h | 4 -- .../ext/CUDAInterop/CCUDAExportableMemory.h | 18 +----- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 21 +++++++ src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 60 +++++++++--------- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 12 ++-- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 52 ++++++++++++++- .../CUDAInterop/CUDAInteropNativeState.hpp | 10 --- src/nbl/ext/CUDAInterop/README.md | 63 ++++++++++--------- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 13 ++-- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 4 +- 11 files changed, 147 insertions(+), 112 deletions(-) diff --git a/examples_tests b/examples_tests index 3b59c9bc05..fbb82d36e0 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3b59c9bc05d8784277d3a18e11f423dcb8ae2b74 +Subproject commit fbb82d36e0f767e867a477a9d1a7035c7cbd56ca diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index 7b994e053f..12465f40f4 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -81,10 +81,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - size_t roundToGranularity(ECUDAMemoryLocation location, size_t size) const; - - core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); - core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h 
b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index b331d6a258..80a9b3630a 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -18,36 +18,22 @@ namespace cuda_native struct SAccess; } -enum class ECUDAMemoryLocation : uint32_t -{ - DEVICE = 1, - HOST = 2, - HOST_NUMA = 3, - HOST_NUMA_CURRENT = 4 -}; - class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: struct SNativeState; - struct SCreationParams + struct SCachedCreationParams { size_t size; uint32_t alignment; - ECUDAMemoryLocation location; - }; - - struct SCachedCreationParams : SCreationParams - { size_t granularSize; external_handle_t externalHandle; + bool deviceLocal; }; CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); ~CCUDAExportableMemory() override; - const SCreationParams& getCreationParams() const { return m_params; } - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index b73f2ae252..dd87d93e43 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -18,6 +18,9 @@ namespace nbl::video::cuda_native { +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; + using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader @@ -143,6 +146,13 @@ struct SCUDADeviceInfo int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; }; +struct SExportableMemoryCreationParams +{ + size_t size; + uint32_t alignment; + CUmemLocationType location; +}; + NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& 
handler); @@ -295,6 +305,7 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); NBL_API2 CUcontext getContext(const CCUDADevice& device); NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); +NBL_API2 core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); @@ -330,6 +341,16 @@ inline size_t roundToGranularity(const core::smart_refctd_ptr& devi return roundToGranularity(*device,location,size); } +inline core::smart_refctd_ptr createExportableMemory(CCUDADevice* device, SExportableMemoryCreationParams&& params) +{ + return createExportableMemory(*device,std::move(params)); +} + +inline core::smart_refctd_ptr createExportableMemory(const core::smart_refctd_ptr& device, SExportableMemoryCreationParams&& params) +{ + return createExportableMemory(*device,std::move(params)); +} + inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory) { return getDeviceptr(*memory); diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index 7d002c86ca..ebac00b7b4 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -58,11 +58,6 @@ CCUDADevice::CCUDADevice( } } -size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation location, size_t size) const -{ - return cuda_native::roundToGranularity(*this,cuda_native::toNative(location),size); -} - namespace cuda_native { @@ -84,6 +79,11 @@ size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, } +static bool isDeviceLocal(CUmemLocationType location) +{ + return location==CU_MEM_LOCATION_TYPE_DEVICE; +} + 
static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { const auto handler = device.getHandler(); @@ -117,12 +117,23 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept return CUDA_SUCCESS; } -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) +namespace cuda_native { - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); - const auto nativeLocation = cuda_native::toNative(params.location); +core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +{ + const auto handler = device.getHandler(); + auto& native = SAccess::native(device); + auto logger = SAccess::logger(device); + + CCUDAExportableMemory::SCachedCreationParams params = { + .size = inParams.size, + .alignment = inParams.alignment, + .granularSize = roundToGranularity(device, inParams.location, inParams.size), + .deviceLocal = isDeviceLocal(inParams.location) + }; + + auto& cu = getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -132,35 +143,34 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::getAllocationHandleType(), - .location = { .type = nativeLocation, .id = m_native->handle }, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = inParams.location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - params.granularSize = roundToGranularity(params.location, params.size); auto nativeState = std::make_unique(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) { - 
m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(*this,&nativeState->ptr, params.granularSize, params.alignment, nativeLocation, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(device,&nativeState->ptr, params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) { - m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); bool closeSucceed = CloseExternalHandle(params.externalHandle); assert(closeSucceed); @@ -175,7 +185,9 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), std::move(nativeState)); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(&device), std::move(params), std::move(nativeState)); +} + } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) @@ -267,16 +279,6 @@ CCUDADevice::CCUDADevice( CCUDADevice::~CCUDADevice() = default; -size_t CCUDADevice::roundToGranularity(ECUDAMemoryLocation, size_t size) const -{ - return size; -} - -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&&) -{ - return nullptr; -} 
- core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index a89e42b2f6..a65d1b680c 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -22,14 +22,10 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); - switch (m_params.location) - { - case ECUDAMemoryLocation::DEVICE: memoryTypeBits &= vram; break; - case ECUDAMemoryLocation::HOST_NUMA: - case ECUDAMemoryLocation::HOST_NUMA_CURRENT: - case ECUDAMemoryLocation::HOST: memoryTypeBits &= ~vram; break; - default: break; - } + if (m_params.deviceLocal) + memoryTypeBits &= vram; + else + memoryTypeBits &= ~vram; IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; req.size = m_params.granularSize; diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 51f0656f6c..777a1db14a 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -12,6 +12,21 @@ namespace nbl::video { + +namespace +{ + +int cudaVersionMajor(int version) +{ + return version/1000; +} + +int cudaVersionMinor(int version) +{ + return (version%1000)/10; +} + +} CCUDAHandler::CCUDAHandler( std::unique_ptr&& nativeState, @@ -455,6 +470,8 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { + const system::logger_opt_ptr logger(_logger.get()); + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" @@ -502,18 +519,32 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #define SAFE_CUDA_CALL(FUNC,...) 
\ {\ if (!cuda.p ## FUNC)\ + {\ + logger.log("CCUDAHandler: CUDA Driver API function %s was not found. Need CUDA driver runtime %d.%d or newer.",system::ILogger::ELL_ERROR,#FUNC,cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion));\ return nullptr;\ + }\ auto result = cuda.p ## FUNC(__VA_ARGS__);\ if (result!=CUDA_SUCCESS)\ + {\ + logger.log("CCUDAHandler: %s failed with CUDA error code %d.",system::ILogger::ELL_ERROR,#FUNC,static_cast(result));\ return nullptr;\ + }\ } SAFE_CUDA_CALL(cuInit,0) int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<13000) + if (cudaVersion CCUDAHandler::create(system::ISystem* syste // check nvrtc existence and compatibility if (!nvrtc.pnvrtcVersion) + { + logger.log("CCUDAHandler: NVRTC runtime was not found. Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); return nullptr; + } int nvrtcVersion[2] = { -1,-1 }; - nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersion[0]<9) + const auto nvrtcVersionResult = nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); + if (nvrtcVersionResult!=NVRTC_SUCCESS) + { + logger.log("CCUDAHandler: nvrtcVersion failed with NVRTC error code %d.",system::ILogger::ELL_ERROR,static_cast(nvrtcVersionResult)); return nullptr; + } + if (nvrtcVersion[0]> headers; diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp index 2dc3c3bbca..47701359ba 100644 --- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp +++ b/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp @@ -57,16 +57,6 @@ struct CCUDAImportedSemaphore::SNativeState namespace cuda_native { -inline CUmemLocationType toNative(ECUDAMemoryLocation location) -{ - return static_cast(static_cast(location)); -} - -inline ECUDAMemoryLocation toNabla(CUmemLocationType location) -{ - return static_cast(static_cast(location)); -} - inline 
CUmemAllocationHandleType getAllocationHandleType() { #ifdef _WIN32 diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 407b5e81b3..cf3a89cdd1 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,32 +1,12 @@ # CUDA Interop Targets -- `Nabla::Nabla` does not require the CUDA SDK. -- `Nabla::Nabla` provides Nabla CUDA interop types when the package was built with CUDA support. -- Nabla CUDA interop public headers do not include `cuda.h` or `nvrtc.h`. -- `Nabla::ext::CUDAInterop` is the raw CUDA Driver API and NVRTC opt-in target. +- `Nabla::Nabla` owns the CUDA interop implementation and exported symbols. +- `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`. +- `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target. - `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. -- Consumers can build native CUDA code against a compatible local SDK without rebuilding Nabla. -- Changing CUDA SDK headers affects only targets that include `CUDAInteropNative.h`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. -## Design - -- CUDA is used privately while building `Nabla::Nabla`. -- CUDA SDK headers become visible to consumers only through `Nabla::ext::CUDAInterop`. -- `Nabla::Nabla` exposes Nabla concepts such as devices, exported memory, imported memory, and imported semaphores. -- `Nabla::ext::CUDAInterop` exposes CUDA types such as `CUdeviceptr`, `CUmodule`, `CUfunction`, external memory, external semaphores, and NVRTC objects. -- The dependency shape follows the same general model used by libraries such as OpenCV: common CUDA-facing APIs do not force raw CUDA headers on every consumer, while raw CUDA access is available through an explicit opt-in header. 
-- This avoids a transitive public compile-time dependency on CUDA from `Nabla::Nabla`. - -## OpenCV Reference - -- OpenCV's common CUDA header includes OpenCV headers, not raw CUDA SDK headers: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L51-L52). -- OpenCV keeps the public stream type as an OpenCV abstraction and grants access through `StreamAccessor`: [`cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda.hpp#L916-L979). -- OpenCV's raw CUDA opt-in header says it is the only header that depends on the CUDA Runtime API, then includes `<cuda_runtime.h>` and exposes accessor types: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). -- OpenCV also keeps implementation CUDA headers private and includes `<cuda.h>` / `<cuda_runtime.h>` there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). -- The same split is used here: Nabla CUDA objects stay in `Nabla::Nabla`, and raw CUDA handles/functions are available only after including `CUDAInteropNative.h` and linking `Nabla::ext::CUDAInterop`. 
- ## Usage ```cmake @@ -35,18 +15,39 @@ target_link_libraries(app PRIVATE Nabla::Nabla) ``` ```cmake -find_package(Nabla CONFIG REQUIRED COMPONENTS Core CUDAInterop) +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) ``` +```cpp +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { + .size = size, + .alignment = alignment, + .location = CU_MEM_LOCATION_TYPE_DEVICE, +}); +``` + ## Properties -- `Nabla::Nabla` can be built with CUDA support without making CUDA SDK headers a public compile-time requirement. -- Consumers that only link `Nabla::Nabla` do not need a CUDA SDK to parse Nabla headers. -- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop` explicitly. -- Raw CUDA access is not wrapped away. Native code can use CUDA Driver API types, NVRTC types, and Nabla native accessors in the opt-in path. +- Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. +- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop`. +- Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly. +- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI. +- The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. +- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. +- `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. - The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. 
-- CUDA OFF implementations are local stubs in the same `.cpp` files. Clean API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. - CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`. -- A package built with CUDA support can be consumed without a local CUDA SDK unless the `CUDAInterop` component is requested. -- A consumer can use a compatible local CUDA SDK for native interop without rebuilding Nabla. + +## Related Designs + +- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27). 
+- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: [`device_impl.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30), [`device.cpp`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.cpp#L10-L48). +- ONNX Runtime keeps accelerator dependencies behind execution providers and supports provider shared libraries loaded only when requested: [`Build with Execution Providers`](https://onnxruntime.ai/docs/build/eps.html#execution-provider-shared-libraries). +- ggml/llama.cpp keeps the generic backend API separate from CUDA and builds CUDA as an explicit backend target with CUDA libraries linked to that backend: [`ggml-backend.h`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/include/ggml-backend.h#L1488-L1499), [`ggml-cuda CMakeLists.txt`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cuda/CMakeLists.txt#L982-L1072). +- TensorFlow PluggableDevice uses separate device plugin packages so accelerator toolchains and dependencies do not become core TensorFlow requirements: [`PluggableDevice`](https://blog.tensorflow.org/2021/06/pluggabledevice-device-plugins-for-TensorFlow.html). 
diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp index 348caa766e..e36fe65701 100644 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -23,14 +23,11 @@ class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramewo bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { - static_assert(std::is_same_v); - - const nbl::video::CCUDAExportableMemory::SCreationParams params = { - .size = 4096, - .alignment = 4096, - .location = nbl::video::ECUDAMemoryLocation::DEVICE, - }; - return isAPILoaded() && params.location==nbl::video::ECUDAMemoryLocation::DEVICE; + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + return isAPILoaded(); } void workLoopBody() override {} diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index a78f710040..6dda3d275e 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -22,10 +22,10 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cudaDevice.createExportableMemory({ + auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = 4096, .alignment = 4096, - .location = ECUDAMemoryLocation::DEVICE, + .location = CU_MEM_LOCATION_TYPE_DEVICE, }); if (!cudaMemory) return false; From 141790523f61caa5fbbf45223ba4cfa0bade78c9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 11:58:20 +0200 Subject: [PATCH 099/149] Add CUDA interop runtime header discovery --- CMakeLists.txt | 1 + cmake/NablaCUDAInteropHelpers.cmake | 182 ++++++++++ cmake/NablaConfig.cmake.in | 3 + examples_tests | 2 +- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 16 + src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 331 
+++++++++++++++++- src/nbl/ext/CUDAInterop/CMakeLists.txt | 19 +- src/nbl/ext/CUDAInterop/README.md | 48 ++- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 8 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 42 +++ 10 files changed, 641 insertions(+), 11 deletions(-) create mode 100644 cmake/NablaCUDAInteropHelpers.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 14845789fc..9251a3ee68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -312,6 +312,7 @@ if(NBL_ENABLE_CONFIG_INSTALL) set(_NBL_NABLA_CONFIG_FILES "${CMAKE_CURRENT_BINARY_DIR}/NablaConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/NablaConfigVersion.cmake" + "${CMAKE_CURRENT_LIST_DIR}/cmake/NablaCUDAInteropHelpers.cmake" ) install(EXPORT NablaExportTargets diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake new file mode 100644 index 0000000000..6486789aeb --- /dev/null +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -0,0 +1,182 @@ +function(_nbl_cuda_interop_collect_runtime_include_dirs _OUT_INCLUDE_DIRS) + set(_include_dirs ${ARGN}) + + if(DEFINED CUDAToolkit_INCLUDE_DIRS AND NOT "${CUDAToolkit_INCLUDE_DIRS}" STREQUAL "") + list(APPEND _include_dirs ${CUDAToolkit_INCLUDE_DIRS}) + endif() + + if(TARGET CUDA::toolkit) + get_target_property(_cuda_toolkit_include_dirs CUDA::toolkit INTERFACE_INCLUDE_DIRECTORIES) + if(_cuda_toolkit_include_dirs AND NOT _cuda_toolkit_include_dirs STREQUAL "NOTFOUND") + list(APPEND _include_dirs ${_cuda_toolkit_include_dirs}) + endif() + endif() + + if(_include_dirs) + list(REMOVE_DUPLICATES _include_dirs) + endif() + + set(${_OUT_INCLUDE_DIRS} ${_include_dirs} PARENT_SCOPE) +endfunction() + +function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) + set(_include_dirs ${ARGN}) + set(_json "{\n \"cudaRuntimeIncludeDirs\": [") + set(_first ON) + + foreach(_include_dir IN LISTS _include_dirs) + if("${_include_dir}" STREQUAL "") + continue() + endif() + + file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) + string(REPLACE "\"" "\\\"" 
_include_dir_json "${_include_dir_json}") + + if(_first) + string(APPEND _json "\n") + set(_first OFF) + else() + string(APPEND _json ",\n") + endif() + string(APPEND _json " \"${_include_dir_json}\"") + endforeach() + + if(NOT _first) + string(APPEND _json "\n ]\n}\n") + else() + string(APPEND _json "]\n}\n") + endif() + + set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE) +endfunction() + +function(_nbl_cuda_interop_collect_configs _OUT_CONFIGS) + if(CMAKE_CONFIGURATION_TYPES) + set(_configs ${CMAKE_CONFIGURATION_TYPES}) + elseif(CMAKE_BUILD_TYPE) + set(_configs "${CMAKE_BUILD_TYPE}") + else() + set(_configs Debug) + endif() + + list(REMOVE_DUPLICATES _configs) + set(${_OUT_CONFIGS} ${_configs} PARENT_SCOPE) +endfunction() + +function(_nbl_cuda_interop_collect_target_runtime_jsons TARGET_NAME _OUT_FILES _OVERRIDE_OUTPUT) + _nbl_cuda_interop_collect_configs(_configs) + set(_runtime_jsons "") + + if(NOT "${_OVERRIDE_OUTPUT}" STREQUAL "") + foreach(_config IN LISTS _configs) + set(_runtime_paths_json "${_OVERRIDE_OUTPUT}") + string(REPLACE "$" "${_config}" _runtime_paths_json "${_runtime_paths_json}") + if(_runtime_paths_json MATCHES "\\$<") + message(FATAL_ERROR "Nabla: CUDA interop runtime JSON path supports only plain paths or $.") + endif() + cmake_path(IS_ABSOLUTE _runtime_paths_json _is_abs) + if(NOT _is_abs) + cmake_path(ABSOLUTE_PATH _runtime_paths_json BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_paths_json) + endif() + cmake_path(NORMAL_PATH _runtime_paths_json OUTPUT_VARIABLE _runtime_paths_json) + list(APPEND _runtime_jsons "${_runtime_paths_json}") + endforeach() + list(REMOVE_DUPLICATES _runtime_jsons) + set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) + return() + endif() + + foreach(_config IN LISTS _configs) + string(TOUPPER "${_config}" _config_upper) + get_target_property(_runtime_output_dir "${TARGET_NAME}" "RUNTIME_OUTPUT_DIRECTORY_${_config_upper}") + + if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") + 
get_target_property(_runtime_output_dir "${TARGET_NAME}" RUNTIME_OUTPUT_DIRECTORY) + endif() + if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}) + set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}}") + endif() + if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) + set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") + endif() + if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") + if(CMAKE_CONFIGURATION_TYPES) + set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}/${_config}") + else() + set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}") + endif() + endif() + + string(REPLACE "$" "${_config}" _runtime_output_dir "${_runtime_output_dir}") + if(_runtime_output_dir MATCHES "\\$<") + message(FATAL_ERROR "Nabla: nbl_configure_cuda_interop_runtime supports only plain runtime output directories or $.") + endif() + + cmake_path(IS_ABSOLUTE _runtime_output_dir _is_abs) + if(NOT _is_abs) + cmake_path(ABSOLUTE_PATH _runtime_output_dir BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_output_dir) + endif() + cmake_path(NORMAL_PATH _runtime_output_dir OUTPUT_VARIABLE _runtime_output_dir) + + list(APPEND _runtime_jsons "${_runtime_output_dir}/nbl_cuda_interop_runtime.json") + endforeach() + + list(REMOVE_DUPLICATES _runtime_jsons) + set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) +endfunction() + +function(nbl_configure_cuda_interop_runtime TARGET_NAME) + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + + if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Nabla: unexpected arguments for nbl_configure_cuda_interop_runtime: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") + endif() + + if(NOT TARGET "${TARGET_NAME}") + message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") + endif() + + 
_nbl_cuda_interop_collect_runtime_include_dirs(_include_dirs ${_NBL_CUDA_INTEROP_INCLUDE_DIRS}) + + _nbl_cuda_interop_make_runtime_paths_json(_runtime_paths_json_content ${_include_dirs}) + _nbl_cuda_interop_collect_target_runtime_jsons("${TARGET_NAME}" _runtime_paths_jsons "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") + + foreach(_runtime_paths_json IN LISTS _runtime_paths_jsons) + file(GENERATE OUTPUT "${_runtime_paths_json}" CONTENT "${_runtime_paths_json_content}" TARGET "${TARGET_NAME}") + endforeach() + + set_source_files_properties(${_runtime_paths_jsons} PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE) + target_sources("${TARGET_NAME}" PRIVATE ${_runtime_paths_jsons}) +endfunction() + +function(nbl_target_link_cuda_interop TARGET_NAME) + set(_args ${ARGN}) + set(_scope PRIVATE) + + if(_args) + list(GET _args 0 _first_arg) + if(_first_arg MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + set(_scope "${_first_arg}") + list(REMOVE_AT _args 0) + endif() + endif() + + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${_args}) + + if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "Nabla: unexpected arguments for nbl_target_link_cuda_interop: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") + endif() + + if(NOT TARGET "${TARGET_NAME}") + message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") + endif() + if(NOT TARGET Nabla::ext::CUDAInterop) + message(FATAL_ERROR "Nabla: Nabla::ext::CUDAInterop is not available. 
Request the CUDAInterop package component or enable NBL_COMPILE_WITH_CUDA.") + endif() + + target_link_libraries("${TARGET_NAME}" ${_scope} Nabla::ext::CUDAInterop) + nbl_configure_cuda_interop_runtime("${TARGET_NAME}" + RUNTIME_JSON "${_NBL_CUDA_INTEROP_RUNTIME_JSON}" + INCLUDE_DIRS ${_NBL_CUDA_INTEROP_INCLUDE_DIRS} + ) +endfunction() diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index 8b9f62e548..0464340ce3 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -96,6 +96,9 @@ if(_NBL_NABLA_LOAD_CUDA_INTEROP) _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + endif() endif() endif() diff --git a/examples_tests b/examples_tests index fbb82d36e0..b2c639c8b7 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit fbb82d36e0f767e867a477a9d1a7035c7cbd56ca +Subproject commit b2c639c8b71c3b860418dc4b3e46ad147ba5f256 diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index 6a3cc6c496..bed4f9a31c 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -8,6 +8,7 @@ #include "nbl/core/definitions.h" #include "nbl/system/declarations.h" +#include "nbl/system/path.h" #include #include @@ -25,6 +26,21 @@ namespace cuda_native struct SAccess; } +namespace cuda_interop +{ +inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; + +struct SRuntimeCompileEnvironment +{ + core::vector includeDirs; + core::vector runtimePathFiles; +}; + +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); +NBL_API2 
SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles); +NBL_API2 core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment); +} + class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 777a1db14a..fce7fd2b5a 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -3,6 +3,324 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/system/ModuleLookupUtils.h" + +#include +#include +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ +namespace +{ + +std::string readEnvironmentVariable(std::string_view name) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + char* value = nullptr; + size_t size = 0; + if (_dupenv_s(&value,&size,std::string(name).c_str()) || !value) + return {}; + std::string result(value); + std::free(value); + return result; + #else + if (const char* value = std::getenv(std::string(name).c_str())) + return value; + return {}; + #endif +} + +bool isDirectory(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_directory(path,error); +} + +bool isRegularFile(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_regular_file(path,error); +} + +system::path normalizedAbsolute(system::path path) +{ + std::error_code error; + auto absolute = std::filesystem::absolute(path,error); + if (error) + absolute = std::move(path); + return absolute.lexically_normal(); +} + +bool looksLikeCUDAIncludeDir(const system::path& path) +{ + if (!isDirectory(path)) + return false; + + return isRegularFile(path/"cuda_fp16.h") || + isRegularFile(path/"cuda_runtime_api.h") || + 
isRegularFile(path/"vector_types.h") || + isRegularFile(path/"cuda.h") || + isRegularFile(path/"nv"/"target"); +} + +void appendIncludeDir(core::vector& includeDirs, system::path path) +{ + if (path.empty() || !looksLikeCUDAIncludeDir(path)) + return; + + path = normalizedAbsolute(std::move(path)); + const auto pathString = path.generic_string(); + const auto alreadyAdded = std::find_if(includeDirs.begin(),includeDirs.end(),[&](const system::path& existing) { + return existing.generic_string()==pathString; + }); + if (alreadyAdded==includeDirs.end()) + includeDirs.push_back(std::move(path)); +} + +void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) +{ + if (root.empty()) + return; + + appendIncludeDir(includeDirs,root); + appendIncludeDir(includeDirs,root/"include"); +} + +core::vector parseStringArray(std::string_view text, std::string_view key) +{ + core::vector values; + const std::string quotedKey = "\"" + std::string(key) + "\""; + const auto keyPos = text.find(quotedKey); + if (keyPos==std::string_view::npos) + return values; + + const auto arrayBegin = text.find('[',keyPos+quotedKey.size()); + if (arrayBegin==std::string_view::npos) + return values; + const auto arrayEnd = text.find(']',arrayBegin+1); + if (arrayEnd==std::string_view::npos) + return values; + + for (auto pos = arrayBegin+1; pos=arrayEnd) + break; + + std::string value; + auto cursor = quoteBegin+1; + for (; cursor& includeDirs, const system::path& configFile) +{ + if (!isRegularFile(configFile)) + return; + + std::ifstream input(configFile); + if (!input) + return; + + std::stringstream buffer; + buffer << input.rdbuf(); + for (const auto& path : parseStringArray(buffer.str(),"cudaRuntimeIncludeDirs")) + appendIncludeDir(includeDirs,system::path(path)); +} + +void appendRuntimePathsConfigEnv(core::vector& includeDirs, std::string_view name) +{ + const auto value = readEnvironmentVariable(name); + if (value.empty()) + return; + + #if defined(_NBL_PLATFORM_WINDOWS_) 
+ constexpr char Separator = ';'; + #else + constexpr char Separator = ':'; + #endif + + size_t begin = 0; + while (begin& includeDirs, const core::vector& explicitRuntimePathFiles) +{ + for (const auto& runtimePathFile : explicitRuntimePathFiles) + appendRuntimePathsConfig(includeDirs,runtimePathFile); + + appendRuntimePathsConfigEnv(includeDirs,"NBL_CUDA_INTEROP_RUNTIME_JSON"); + appendRuntimePathsConfigEnv(includeDirs,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + + const auto exeDir = system::executableDirectory(); + if (!exeDir.empty()) + appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); + + #if defined(_NBL_PLATFORM_WINDOWS_) + const auto releaseModuleDir = system::loadedModuleDirectory("Nabla.dll"); + if (!releaseModuleDir.empty()) + appendRuntimePathsConfig(includeDirs,releaseModuleDir/RuntimePathsFileName); + const auto debugModuleDir = system::loadedModuleDirectory("Nabla_debug.dll"); + if (!debugModuleDir.empty()) + appendRuntimePathsConfig(includeDirs,debugModuleDir/RuntimePathsFileName); + #endif +} + +void appendAppLocalIncludeDirs(core::vector& includeDirs) +{ + const auto exeDir = system::executableDirectory(); + if (exeDir.empty()) + return; + + appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); + appendIncludeDir(includeDirs,exeDir/"nvidia"/"cu13"/"include"); + appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); + appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); +} + +void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) +{ + if (root.empty()) + return; + + appendIncludeDir(includeDirs,root/"Lib"/"site-packages"/"nvidia"/"cu13"/"include"); + appendIncludeDir(includeDirs,root/"lib"/"site-packages"/"nvidia"/"cu13"/"include"); + appendIncludeDir(includeDirs,root/"Library"/"include"); + appendIncludeDir(includeDirs,root/"include"); +} + +void appendPathListEnv(core::vector& includeDirs, std::string_view name) +{ + const auto value = readEnvironmentVariable(name); + if 
(value.empty()) + return; + + #if defined(_NBL_PLATFORM_WINDOWS_) + constexpr char Separator = ';'; + #else + constexpr char Separator = ':'; + #endif + + size_t begin = 0; + while (begin& includeDirs) +{ + appendPathListEnv(includeDirs,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); + appendPathListEnv(includeDirs,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_ROOT")); + appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDAToolkit_ROOT")); + + appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("VIRTUAL_ENV")); + appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("CONDA_PREFIX")); +} + +void appendCUDAInstallRoots(core::vector& includeDirs, const system::path& root) +{ + if (!isDirectory(root)) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + if (!entry.is_directory(error)) + continue; + candidates.push_back(entry.path()/"include"); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendIncludeDir(includeDirs,candidate); +} + +void appendSystemIncludeDirs(core::vector& includeDirs) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + appendCUDAInstallRoots(includeDirs,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA"); + #else + appendIncludeDir(includeDirs,"/usr/local/cuda/include"); + appendCUDAInstallRoots(includeDirs,"/usr/local"); + appendIncludeDir(includeDirs,"/usr/include"); + #endif +} + +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) +{ + SRuntimeCompileEnvironment 
environment; + environment.runtimePathFiles = std::move(runtimePathFiles); + for (auto& includeDir : explicitIncludeDirs) + appendIncludeDir(environment.includeDirs,std::move(includeDir)); + + appendRuntimePathsConfigs(environment.includeDirs,environment.runtimePathFiles); + appendAppLocalIncludeDirs(environment.includeDirs); + appendEnvironmentIncludeDirs(environment.includeDirs); + appendSystemIncludeDirs(environment.includeDirs); + + return environment; +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs) +{ + return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{}); +} + +core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} + +} #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" @@ -671,7 +989,18 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv if (result!=NVRTC_SUCCESS) return {nullptr,result}; - result = compileProgram(handler,program,nvrtcOptions); + const auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment(); + const auto runtimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + core::vector options; + options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); + for (const auto option : nvrtcOptions) + options.push_back(option); + for (const auto& option : runtimeIncludeOptions) + options.push_back(option.c_str()); + + const auto* optionsBegin = options.empty() ? nullptr:options.data(); + const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); + result = compileProgram(handler,program,{optionsBegin,optionsEnd}); if (log) getProgramLog(handler,program,*log); if (result!=NVRTC_SUCCESS) diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt index 438ab51d8f..a9e1663fa9 100644 --- a/src/nbl/ext/CUDAInterop/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -1,13 +1,22 @@ -include(${NBL_ROOT_PATH}/cmake/common.cmake) +include(common) +include(NablaCUDAInteropHelpers) if (NBL_COMPILE_WITH_CUDA) set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") - add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE) + file(GLOB NBL_EXT_CUDA_INTEROP_IDE_HEADERS CONFIGURE_DEPENDS "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop/*.h") + set(NBL_EXT_CUDA_INTEROP_IDE_SOURCES + ${NBL_EXT_CUDA_INTEROP_IDE_HEADERS} + CMakeLists.txt + README.md + ) + set_source_files_properties(${NBL_EXT_CUDA_INTEROP_IDE_SOURCES} PROPERTIES HEADER_FILE_ONLY TRUE) + + # Header-only opt-in target. It builds no artifact and adds CUDA SDK usage requirements only for native interop consumers. + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE EXCLUDE_FROM_ALL ${NBL_EXT_CUDA_INTEROP_IDE_SOURCES}) target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE - $ - $ - $ + Nabla + CUDA::toolkit ) set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index cf3a89cdd1..837f3ab28e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -2,8 +2,12 @@ - `Nabla::Nabla` owns the CUDA interop implementation and exported symbols. - `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`. +- The SDK-free interop headers stay stable for CUDA ON and CUDA OFF Nabla builds. - `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target. 
+- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It does not build a library or executable artifact. +- The target only carries usage requirements and IDE-visible sources. - `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. +- `CUDAInteropNative.h` is the small opt-in header that includes CUDA SDK headers such as `cuda.h` and `nvrtc.h`. - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. @@ -16,12 +20,28 @@ target_link_libraries(app PRIVATE Nabla::Nabla) ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) -target_link_libraries(native_app PRIVATE Nabla::ext::CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE) +``` + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE + INCLUDE_DIRS "${cuda_runtime_headers}" +) +``` + +```cmake +nbl_target_link_cuda_interop(native_app PRIVATE + RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$/my_cuda_runtime.json" +) ``` ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +auto runtimeEnv = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); +auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnv); + auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { .size = size, .alignment = alignment, @@ -29,6 +49,23 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { }); ``` +## Runtime Header Discovery + +- `nbl_target_link_cuda_interop( )` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target. +- The helper is defined once in `NablaCUDAInteropHelpers.cmake` and is available from the source tree and installed `NablaConfig.cmake`. +- For each target it writes `nbl_cuda_interop_runtime.json` next to the executable during CMake generation. 
+- `RUNTIME_JSON ` overrides the generated JSON location. Plain paths and `$` are supported. +- `cuda_interop::findRuntimeCompileEnvironment` can also receive explicit JSON paths at runtime. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. +- The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- Runtime lookup reads `nbl_cuda_interop_runtime.json` first, then checks app-local include bundles, explicit environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list. +- Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. +- If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit. +- `CUDA_PATH` is a developer fallback. It is not required for packaged applications. +- Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON. + ## Properties - Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. @@ -38,12 +75,17 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { - The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. - `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. 
It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. +- Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla. +- Native consumers can use a newer compatible CUDA SDK or a runtime/header package without rebuilding Nabla. +- Toggling Nabla CUDA support does not change SDK-free public header parse requirements for consumers. - The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. -- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. -- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`, so CUDA OFF builds do not need `cuda.h` or `nvrtc.h`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. ## Related Designs +This split follows the same public-boundary pattern used by mature GPU projects: SDK-free default headers, native access through an explicit opt-in path, and SDK-dependent implementation details outside the default public API. + - OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). 
- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). - Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27). diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index bdda95fb03..7118eeff09 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.30) project(NblExtCUDAInteropSmoke CXX) option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." OFF) +set(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON "" CACHE FILEPATH "Optional CUDA interop runtime JSON path used by the native smoke.") if(NOT TARGET Nabla::Nabla) set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) @@ -28,5 +29,10 @@ target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) if(TARGET Nabla::ext::CUDAInterop) nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) - target_link_libraries(NblExtCUDAInteropNativeOptInSmoke PRIVATE Nabla::ext::CUDAInterop) + set(_nbl_cuda_interop_smoke_args PRIVATE) + if(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON) + list(APPEND _nbl_cuda_interop_smoke_args RUNTIME_JSON "${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + target_compile_definitions(NblExtCUDAInteropNativeOptInSmoke PRIVATE NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON="${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + endif() + nbl_target_link_cuda_interop(NblExtCUDAInteropNativeOptInSmoke ${_nbl_cuda_interop_smoke_args}) endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp 
b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 6dda3d275e..3b799a56cf 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -82,6 +83,30 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) releaseContext(); return ok && std::ranges::equal(input, output); } + +bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + #include + extern "C" __global__ void fp16_probe(unsigned short* out) + { + out[0] = sizeof(__half); + } + )cuda"; + + std::string log; + auto [ptx, result] = cuda_native::compileDirectlyToPTX( + handler, + Source, + "cuda_fp16_discovery_probe.cu", + {nullptr,nullptr}, + 0, + nullptr, + nullptr, + &log + ); + return result==NVRTC_SUCCESS && ptx && ptx->getSize()>0u; +} } class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework @@ -98,10 +123,27 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew static_assert(std::is_same_v())), CUdevice>); + #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); + if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) + return false; + #else + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); + #endif + const auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto hasRuntimeHeaders = std::find_if(runtimeEnvironment.includeDirs.begin(),runtimeEnvironment.includeDirs.end(),[](const auto& includeDir) { + return std::filesystem::exists(includeDir/"cuda_fp16.h") || std::filesystem::exists(includeDir/"cuda_runtime_api.h"); + })!=runtimeEnvironment.includeDirs.end(); + if (includeOptions.empty() || !hasRuntimeHeaders) + return false; + auto handler 
= nbl::video::CCUDAHandler::create(nullptr, nullptr); if (!handler) return true; + if (!cudaFp16HeaderCompileProbe(*handler)) + return false; + const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler); if (devices.empty()) return true; From 045432e616810403aa55d1232cd57fbbcc6dc8d1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 12:47:01 +0200 Subject: [PATCH 100/149] Tighten CUDA interop native helpers --- cmake/NablaCUDAInteropHelpers.cmake | 30 +-- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 255 ++++++++---------- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 81 ++---- src/nbl/ext/CUDAInterop/README.md | 27 ++ 4 files changed, 178 insertions(+), 215 deletions(-) diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake index 6486789aeb..9c1ac657d4 100644 --- a/cmake/NablaCUDAInteropHelpers.cmake +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -21,8 +21,7 @@ endfunction() function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) set(_include_dirs ${ARGN}) - set(_json "{\n \"cudaRuntimeIncludeDirs\": [") - set(_first ON) + set(_cuda_runtime_include_dir_entries "") foreach(_include_dir IN LISTS _include_dirs) if("${_include_dir}" STREQUAL "") @@ -32,21 +31,22 @@ function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}") - if(_first) - string(APPEND _json "\n") - set(_first OFF) - else() - string(APPEND _json ",\n") - endif() - string(APPEND _json " \"${_include_dir_json}\"") + list(APPEND _cuda_runtime_include_dir_entries " \"${_include_dir_json}\"") endforeach() - if(NOT _first) - string(APPEND _json "\n ]\n}\n") - else() - string(APPEND _json "]\n}\n") - endif() - + set(_json_entry_separator [=[ +, +]=]) + list(JOIN _cuda_runtime_include_dir_entries "${_json_entry_separator}" _cuda_runtime_include_dirs) + + set(_json [=[ +{ + "cudaRuntimeIncludeDirs": [ 
+@_cuda_runtime_include_dirs@ + ] +} +]=]) + string(CONFIGURE "${_json}" _json @ONLY) set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE) endfunction() diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index dd87d93e43..6833ad8189 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -9,6 +9,11 @@ #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" +#include +#include +#include +#include + #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 13000 @@ -153,27 +158,64 @@ struct SExportableMemoryCreationParams CUmemLocationType location; }; -NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +namespace detail +{ + +template +struct is_smart_refctd_ptr : std::false_type {}; + +template +struct is_smart_refctd_ptr> : std::true_type {}; + +template +inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; -inline const CUDA& getCUDAFunctionTable(const CCUDAHandler* handler) +template +inline constexpr bool is_indirect_object_v = std::is_pointer_v> || is_smart_refctd_ptr_v; + +template +decltype(auto) as_ref(Object&& object) { - return getCUDAFunctionTable(*handler); + using object_t = std::remove_cvref_t; + if constexpr (std::is_pointer_v) + return *object; + else if constexpr (is_smart_refctd_ptr_v) + return *object; + else + return std::forward(object); } -inline const CUDA& getCUDAFunctionTable(const core::smart_refctd_ptr& handler) -{ - return getCUDAFunctionTable(*handler); +template +concept object_like = is_indirect_object_v && requires(Object&& object) { + { as_ref(std::forward(object)) } -> std::convertible_to; +}; + +template +concept const_object_like = is_indirect_object_v && requires(Object&& object) { + { as_ref(std::forward(object)) } -> std::convertible_to; +}; + +template +concept 
program_text_source = std::same_as, std::string> || + std::convertible_to; + } -inline const NVRTC& getNVRTCFunctionTable(const CCUDAHandler* handler) +NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + +template +requires detail::const_object_like +inline const CUDA& getCUDAFunctionTable(Handler&& handler) { - return getNVRTCFunctionTable(*handler); + return getCUDAFunctionTable(detail::as_ref(std::forward(handler))); } -inline const NVRTC& getNVRTCFunctionTable(const core::smart_refctd_ptr& handler) +template +requires detail::const_object_like +inline const NVRTC& getNVRTCFunctionTable(Handler&& handler) { - return getNVRTCFunctionTable(*handler); + return getNVRTCFunctionTable(detail::as_ref(std::forward(handler))); } NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); @@ -185,14 +227,11 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); -inline const core::vector& getAvailableDevices(const CCUDAHandler* handler) -{ - return getAvailableDevices(*handler); -} - -inline const core::vector& getAvailableDevices(const core::smart_refctd_ptr& handler) +template +requires detail::const_object_like +inline const core::vector& getAvailableDevices(Handler&& handler) { - return getAvailableDevices(*handler); + return getAvailableDevices(detail::as_ref(std::forward(handler))); } NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); @@ -201,29 +240,26 @@ inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, cons return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); } NBL_API2 nvrtcResult 
createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); -inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,std::move(source),name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(*handler,prog,source,name,headerCount,headerContents,includeNames); -} -inline nvrtcResult createProgram(CCUDAHandler* handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + +template +requires detail::object_like && detail::program_text_source +inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* 
const* includeNames=nullptr) { - return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); + auto& handlerRef = detail::as_ref(std::forward(handler)); + if constexpr (std::same_as, std::string>) + return createProgram(handlerRef,prog,std::string(std::forward(source)),name,headerCount,headerContents,includeNames); + else + { + const char* sourceText = source; + return createProgram(handlerRef,prog,sourceText,name,headerCount,headerContents,includeNames); + } } -inline nvrtcResult createProgram(const core::smart_refctd_ptr& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + +template +requires detail::object_like && std::convertible_to +inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { - return createProgram(*handler,prog,file,headerCount,headerContents,includeNames); + return createProgram(detail::as_ref(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); } NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -253,53 +289,34 @@ NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ); + +template +requires detail::object_like && detail::program_text_source inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler* handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return 
compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const core::smart_refctd_ptr& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(*handler,std::move(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler* handler, const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const core::smart_refctd_ptr& handler, const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(*handler,source,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler* handler, system::IFile* file, core::SRange nvrtcOptions, + Handler&& handler, Source&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); + auto& handlerRef = detail::as_ref(std::forward(handler)); + if constexpr (std::same_as, std::string>) + return 
compileDirectlyToPTX(handlerRef,std::string(std::forward(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); + else + { + const char* sourceText = source; + return compileDirectlyToPTX(handlerRef,sourceText,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); + } } + +template +requires detail::object_like && std::convertible_to inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const core::smart_refctd_ptr& handler, system::IFile* file, core::SRange nvrtcOptions, + Handler&& handler, File file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - return compileDirectlyToPTX(*handler,file,nvrtcOptions,headerCount,headerContents,includeNames,log); + return compileDirectlyToPTX(detail::as_ref(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); } NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); @@ -311,84 +328,50 @@ NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); -inline CUdevice getInternalObject(const CCUDADevice* device) -{ - return getInternalObject(*device); -} - -inline CUdevice getInternalObject(const core::smart_refctd_ptr& device) -{ - return getInternalObject(*device); -} - -inline CUcontext getContext(const CCUDADevice* device) -{ - return getContext(*device); -} - -inline CUcontext getContext(const core::smart_refctd_ptr& device) -{ - return getContext(*device); -} - -inline size_t roundToGranularity(const CCUDADevice* device, CUmemLocationType location, size_t size) -{ - return roundToGranularity(*device,location,size); -} - -inline size_t roundToGranularity(const core::smart_refctd_ptr& device, CUmemLocationType location, 
size_t size) -{ - return roundToGranularity(*device,location,size); -} - -inline core::smart_refctd_ptr createExportableMemory(CCUDADevice* device, SExportableMemoryCreationParams&& params) -{ - return createExportableMemory(*device,std::move(params)); -} - -inline core::smart_refctd_ptr createExportableMemory(const core::smart_refctd_ptr& device, SExportableMemoryCreationParams&& params) -{ - return createExportableMemory(*device,std::move(params)); -} - -inline CUdeviceptr getDeviceptr(const CCUDAExportableMemory* memory) -{ - return getDeviceptr(*memory); -} - -inline CUdeviceptr getDeviceptr(const core::smart_refctd_ptr& memory) -{ - return getDeviceptr(*memory); -} - -inline CUexternalMemory getInternalObject(const CCUDAImportedMemory* memory) +template +requires ( + detail::const_object_like || + detail::const_object_like || + detail::const_object_like +) +inline auto getInternalObject(Object&& object) { - return getInternalObject(*memory); + return getInternalObject(detail::as_ref(std::forward(object))); } -inline CUexternalMemory getInternalObject(const core::smart_refctd_ptr& memory) +template +requires detail::const_object_like +inline CUcontext getContext(Device&& device) { - return getInternalObject(*memory); + return getContext(detail::as_ref(std::forward(device))); } -inline CUresult getMappedBuffer(const CCUDAImportedMemory* memory, CUdeviceptr* mappedBuffer) +template +requires detail::const_object_like +inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size) { - return getMappedBuffer(*memory,mappedBuffer); + return roundToGranularity(detail::as_ref(std::forward(device)),location,size); } -inline CUresult getMappedBuffer(const core::smart_refctd_ptr& memory, CUdeviceptr* mappedBuffer) +template +requires detail::object_like +inline core::smart_refctd_ptr createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params) { - return getMappedBuffer(*memory,mappedBuffer); + return 
createExportableMemory(detail::as_ref(std::forward(device)),std::move(params)); } -inline CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore* semaphore) +template +requires detail::const_object_like +inline CUdeviceptr getDeviceptr(Memory&& memory) { - return getInternalObject(*semaphore); + return getDeviceptr(detail::as_ref(std::forward(memory))); } -inline CUexternalSemaphore getInternalObject(const core::smart_refctd_ptr& semaphore) +template +requires detail::const_object_like +inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer) { - return getInternalObject(*semaphore); + return getMappedBuffer(detail::as_ref(std::forward(memory)),mappedBuffer); } } diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index fce7fd2b5a..13046d6d1e 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -5,11 +5,11 @@ #include "nbl/ext/CUDAInterop/CUDAInterop.h" #include "nbl/system/ModuleLookupUtils.h" +#include "nlohmann/json.hpp" + #include #include #include -#include -#include #include namespace nbl::video::cuda_interop @@ -17,21 +17,11 @@ namespace nbl::video::cuda_interop namespace { -std::string readEnvironmentVariable(std::string_view name) +std::string readEnvironmentVariable(const char* name) { - #if defined(_NBL_PLATFORM_WINDOWS_) - char* value = nullptr; - size_t size = 0; - if (_dupenv_s(&value,&size,std::string(name).c_str()) || !value) - return {}; - std::string result(value); - std::free(value); - return result; - #else - if (const char* value = std::getenv(std::string(name).c_str())) + if (const char* value = std::getenv(name)) return value; return {}; - #endif } bool isDirectory(const system::path& path) @@ -90,50 +80,6 @@ void appendCUDAIncludeRoot(core::vector& includeDirs, const system appendIncludeDir(includeDirs,root/"include"); } -core::vector parseStringArray(std::string_view text, std::string_view key) -{ - core::vector 
values; - const std::string quotedKey = "\"" + std::string(key) + "\""; - const auto keyPos = text.find(quotedKey); - if (keyPos==std::string_view::npos) - return values; - - const auto arrayBegin = text.find('[',keyPos+quotedKey.size()); - if (arrayBegin==std::string_view::npos) - return values; - const auto arrayEnd = text.find(']',arrayBegin+1); - if (arrayEnd==std::string_view::npos) - return values; - - for (auto pos = arrayBegin+1; pos=arrayEnd) - break; - - std::string value; - auto cursor = quoteBegin+1; - for (; cursor& includeDirs, const system::path& configFile) { if (!isRegularFile(configFile)) @@ -143,13 +89,20 @@ void appendRuntimePathsConfig(core::vector& includeDirs, const sys if (!input) return; - std::stringstream buffer; - buffer << input.rdbuf(); - for (const auto& path : parseStringArray(buffer.str(),"cudaRuntimeIncludeDirs")) - appendIncludeDir(includeDirs,system::path(path)); + const auto json = nlohmann::json::parse(input,nullptr,false); + if (json.is_discarded()) + return; + + const auto paths = json.find("cudaRuntimeIncludeDirs"); + if (paths==json.end() || !paths->is_array()) + return; + + for (const auto& path : *paths) + if (path.is_string()) + appendIncludeDir(includeDirs,system::path(path.get())); } -void appendRuntimePathsConfigEnv(core::vector& includeDirs, std::string_view name) +void appendRuntimePathsConfigEnv(core::vector& includeDirs, const char* name) { const auto value = readEnvironmentVariable(name); if (value.empty()) @@ -218,7 +171,7 @@ void appendPythonPackageIncludeDirs(core::vector& includeDirs, con appendIncludeDir(includeDirs,root/"include"); } -void appendPathListEnv(core::vector& includeDirs, std::string_view name) +void appendPathListEnv(core::vector& includeDirs, const char* name) { const auto value = readEnvironmentVariable(name); if (value.empty()) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 837f3ab28e..c75300016e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ 
b/src/nbl/ext/CUDAInterop/README.md @@ -66,6 +66,33 @@ auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { - `CUDA_PATH` is a developer fallback. It is not required for packaged applications. - Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON. +## Runtime Header Distribution + +Nabla packages do not ship CUDA runtime headers. That is a packaging choice, not a hard legal requirement for applications that need NVRTC runtime compilation. + +The NVIDIA CUDA EULA limits CUDA redistribution to selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A then lists the CUDA Toolkit files that may be redistributed with applications. See: + +- https://docs.nvidia.com/cuda/eula/#distribution +- https://docs.nvidia.com/cuda/eula/#attachment-a + +Relevant Attachment A header entries include: + +- `nvrtc.h` under `NVIDIA Runtime Compilation Library and Header`. +- `cuda_occupancy.h` under `CUDA Occupancy Calculation Header Library`. +- `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` under `CUDA Floating Point Type Headers`. +- `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, `vector_types.h` under `CUDA Headers for Runtime Compilation`. + +CuPy documents the same runtime-compile problem. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." 
They also show the common `vector_types.h` failure and recommend `nvidia-cuda-runtime-cu12` for PyPI installs or `cuda-cudart-dev` from system packages: + +- https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 +- https://github.com/cupy/cupy/issues/8466 + +For Nabla consumers this means: + +- The default Nabla package stays SDK-free for consumers that only link `Nabla::Nabla`. +- Native interop consumers can install CUDA runtime headers through an official package, point `NBL_CUDA_INTEROP_RUNTIME_JSON` at their own JSON, pass `INCLUDE_DIRS` to `nbl_target_link_cuda_interop`, or ship an app-local header bundle if their distribution model allows it. +- Shipping such headers is a consumer packaging decision. Nabla runtime discovery supports it, but Nabla does not install host-specific CUDA header paths or redistribute CUDA headers by default. + ## Properties - Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. 
From 8a119dda501a7f9c6f979ee7e6d98e6840c04d35 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 13:24:10 +0200 Subject: [PATCH 101/149] Hide CUDA interop native state construction --- include/nbl/ext/CUDAInterop/CCUDADevice.h | 7 ++-- .../ext/CUDAInterop/CCUDAExportableMemory.h | 6 ++- include/nbl/ext/CUDAInterop/CCUDAHandler.h | 6 +-- .../nbl/ext/CUDAInterop/CCUDAImportedMemory.h | 7 ++-- .../ext/CUDAInterop/CCUDAImportedSemaphore.h | 6 ++- src/nbl/ext/CUDAInterop/CCUDADevice.cpp | 22 ++++++++--- .../ext/CUDAInterop/CCUDAExportableMemory.cpp | 24 +++++++++++- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 16 ++++++-- .../ext/CUDAInterop/CCUDAImportedMemory.cpp | 8 +++- .../CUDAInterop/CCUDAImportedSemaphore.cpp | 8 +++- .../CUDAInterop/CUDAInteropNativeState.hpp | 9 +++++ src/nbl/ext/CUDAInterop/README.md | 38 +++++++++++++++++-- 12 files changed, 126 insertions(+), 31 deletions(-) diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/ext/CUDAInterop/CCUDADevice.h index 12465f40f4..94eb450802 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h +++ b/include/nbl/ext/CUDAInterop/CCUDADevice.h @@ -25,7 +25,6 @@ struct SAccess; class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: - struct SNativeState; #ifdef _WIN32 static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; #else @@ -68,8 +67,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} - CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - ~CCUDADevice() override; inline core::SRange geDefaultCompileOptions() const @@ -86,8 +83,12 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted core::smart_refctd_ptr 
importExternalSemaphore(core::smart_refctd_ptr&& sem); private: + friend class CCUDAHandler; friend struct cuda_native::SAccess; + struct SNativeState; + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); + static constexpr auto CudaMemoryLocationCount = 5; const system::logger_opt_ptr m_logger; diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h index 80a9b3630a..6d29739408 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h @@ -21,7 +21,6 @@ struct SAccess; class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: - struct SNativeState; struct SCachedCreationParams { size_t size; @@ -31,7 +30,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted bool deviceLocal; }; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); ~CCUDAExportableMemory() override; core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; @@ -39,6 +37,10 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted private: friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + static core::smart_refctd_ptr create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; SCachedCreationParams m_params; std::unique_ptr m_native; diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/ext/CUDAInterop/CCUDAHandler.h index bed4f9a31c..f6b5d578a8 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAHandler.h +++ 
b/include/nbl/ext/CUDAInterop/CCUDAHandler.h @@ -44,11 +44,8 @@ NBL_API2 core::vector makeNVRTCIncludeOptions(const SRuntimeCompile class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: - struct SNativeState; static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); @@ -75,6 +72,9 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted private: friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + std::unique_ptr m_native; core::vector m_availableDevices; core::vector> m_headers; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h index adb803f12c..87f804ce76 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h @@ -19,14 +19,15 @@ struct SAccess; class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: - struct SNativeState; - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); - ~CCUDAImportedMemory() override; private: + friend class CCUDADevice; friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; std::unique_ptr m_native; diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h index 894f2444c0..c8bf77313e 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h +++ 
b/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h @@ -22,13 +22,15 @@ struct SAccess; class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: - struct SNativeState; - CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); ~CCUDAImportedSemaphore() override; private: + friend class CCUDADevice; friend struct cuda_native::SAccess; + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; std::unique_ptr m_native; diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp index ebac00b7b4..8e696d0827 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDADevice.cpp @@ -27,6 +27,8 @@ CCUDADevice::CCUDADevice( m_handler(std::move(handler)), m_native(std::move(nativeState)) { + assert(m_native); + m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); @@ -150,7 +152,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice #endif }; - auto nativeState = std::make_unique(); + auto nativeState = SAccess::makeExportableMemoryNativeState(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -166,7 +168,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice return nullptr; } - if (const auto err = reserveAddressAndMapMemory(device,&nativeState->ptr, params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(device,&SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) { logger.log("Fail to reserve address and map memory!", 
system::ILogger::ELL_ERROR); @@ -185,7 +187,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(&device), std::move(params), std::move(nativeState)); + return SAccess::makeExportableMemory(core::smart_refctd_ptr(&device),std::move(params),std::move(nativeState)); } } @@ -215,7 +217,10 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), std::make_unique(cuExtMem)); + return core::smart_refctd_ptr( + new CCUDAImportedMemory(core::smart_refctd_ptr(this),std::move(mem),std::make_unique(cuExtMem)), + core::dont_grab + ); } core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) @@ -245,7 +250,10 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), std::make_unique(cusema)); + return core::smart_refctd_ptr( + new CCUDAImportedSemaphore(core::smart_refctd_ptr(this),std::move(sema),std::make_unique(cusema)), + core::dont_grab + ); } CCUDADevice::~CCUDADevice() @@ -275,7 +283,9 @@ CCUDADevice::CCUDADevice( , m_virtualArchitecture(virtualArchitecture) , m_handler(std::move(handler)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} CCUDADevice::~CCUDADevice() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp index a65d1b680c..7d5483af04 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp @@ -14,7 +14,17 @@ CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr : m_device(std::move(device)) , m_params(std::move(params)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} 
+ +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { @@ -76,7 +86,17 @@ CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr : m_device(std::move(device)) , m_params(std::move(params)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} CCUDAExportableMemory::~CCUDAExportableMemory() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 13046d6d1e..229a27cfac 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -309,6 +309,8 @@ CCUDAHandler::CCUDAHandler( , m_logger(std::move(_logger)) , m_version(_version) { + assert(m_native); + for (auto& header : m_headers) { m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); @@ -858,7 +860,10 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - return core::make_smart_refctd_ptr(std::make_unique(std::move(cuda),std::move(nvrtc)), std::move(headers), std::move(_logger), cudaVersion); + return core::smart_refctd_ptr( + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger),cudaVersion), + core::dont_grab + ); } namespace cuda_native @@ -1090,7 +1095,10 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if 
(arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, std::make_unique(device.handle), core::smart_refctd_ptr(this)); + return core::smart_refctd_ptr( + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), + core::dont_grab + ); } } return nullptr; @@ -1115,7 +1123,9 @@ CCUDAHandler::CCUDAHandler( , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) , m_version(_version) -{} +{ + assert(m_native); +} CCUDAHandler::~CCUDAHandler() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp index 8de3ce3e63..3a8ed56371 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp @@ -14,7 +14,9 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev : m_device(std::move(device)) , m_src(std::move(src)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} namespace cuda_native { @@ -57,7 +59,9 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev : m_device(std::move(device)) , m_src(std::move(src)) , m_native(std::move(nativeState)) -{} +{ + assert(m_native); +} CCUDAImportedMemory::~CCUDAImportedMemory() = default; diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp index fdbb56b0cf..6d980ed126 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp @@ -13,7 +13,9 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr makeExportableMemoryNativeState() + { + return std::unique_ptr(new CCUDAExportableMemory::SNativeState()); + } + static CUdeviceptr& deviceptr(CCUDAExportableMemory::SNativeState& nativeState) { return nativeState.ptr; } + static core::smart_refctd_ptr 
makeExportableMemory(core::smart_refctd_ptr device, CCUDAExportableMemory::SCachedCreationParams&& params, std::unique_ptr&& nativeState) + { + return CCUDAExportableMemory::create(std::move(device),std::move(params),std::move(nativeState)); + } static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index c75300016e..214d5add14 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -11,18 +11,26 @@ - Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. - Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. -## Usage +## Basic Usage ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) ``` +This path does not require CUDA SDK headers on the consuming project. + +## Native Opt-In + +Use the native opt-in path only in targets that include `CUDAInteropNative.h` or use raw CUDA Driver API/NVRTC types. + ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE) ``` +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes runtime CUDA header discovery JSON for `native_app`. 
+ ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE @@ -36,19 +44,42 @@ nbl_target_link_cuda_interop(native_app PRIVATE ) ``` +Pseudo flow: + ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" -auto runtimeEnv = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); -auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnv); +auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); +auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { .size = size, .alignment = alignment, .location = CU_MEM_LOCATION_TYPE_DEVICE, }); + +std::string log; +auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( + handler, + cudaSource, + "kernel.cu", + cudaDevice->geDefaultCompileOptions(), + 0, + nullptr, + nullptr, + &log +); ``` +`compileDirectlyToPTX` performs runtime CUDA header discovery internally. Code that drives NVRTC manually can call `cuda_interop::findRuntimeCompileEnvironment` and `cuda_interop::makeNVRTCIncludeOptions` directly. + +Reference smoke: + +- CMake target setup: `src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt` +- SDK-free package boundary check: `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` +- Default Nabla package usage without native opt-in: `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` +- Native CUDA opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC and raw interop usage: `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` + ## Runtime Header Discovery - `nbl_target_link_cuda_interop( )` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target. @@ -100,6 +131,7 @@ For Nabla consumers this means: - Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly. 
- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI. - The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. +- Native state is PIMPL-owned by Nabla. Consumers cannot construct CUDA wrapper objects with arbitrary internal state. - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. - `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. - Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla. From e018545fb659ee74400a2635f93f502cd1d0f4f3 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 13:49:22 +0200 Subject: [PATCH 102/149] Clean up CUDA runtime header discovery --- src/nbl/ext/CUDAInterop/CCUDAHandler.cpp | 112 ++++++++++++----------- src/nbl/ext/CUDAInterop/README.md | 5 +- 2 files changed, 64 insertions(+), 53 deletions(-) diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp index 229a27cfac..de7f14b58f 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp @@ -17,6 +17,12 @@ namespace nbl::video::cuda_interop namespace { +#if defined(_NBL_PLATFORM_WINDOWS_) +inline constexpr char EnvironmentPathListSeparator = ';'; +#else +inline constexpr char EnvironmentPathListSeparator = ':'; +#endif + std::string readEnvironmentVariable(const char* name) { if (const char* value = std::getenv(name)) @@ -71,6 +77,39 @@ void appendIncludeDir(core::vector& includeDirs, system::path path includeDirs.push_back(std::move(path)); } +void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const system::path& root, uint32_t maxDepth) +{ + if (!isDirectory(root)) + return; + + if (looksLikeCUDAIncludeDir(root)) + { + 
appendIncludeDir(includeDirs,root); + return; + } + if (maxDepth==0u) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + + std::error_code entryError; + if (!entry.is_directory(entryError)) + continue; + candidates.push_back(entry.path()); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendCUDAIncludeDirsBelow(includeDirs,candidate,maxDepth-1u); +} + void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) { if (root.empty()) @@ -102,24 +141,20 @@ void appendRuntimePathsConfig(core::vector& includeDirs, const sys appendIncludeDir(includeDirs,system::path(path.get())); } -void appendRuntimePathsConfigEnv(core::vector& includeDirs, const char* name) +template +void appendPathListEnv(const char* name, Append append) { const auto value = readEnvironmentVariable(name); if (value.empty()) return; - #if defined(_NBL_PLATFORM_WINDOWS_) - constexpr char Separator = ';'; - #else - constexpr char Separator = ':'; - #endif - size_t begin = 0; while (begin& includeDirs, const co for (const auto& runtimePathFile : explicitRuntimePathFiles) appendRuntimePathsConfig(includeDirs,runtimePathFile); - appendRuntimePathsConfigEnv(includeDirs,"NBL_CUDA_INTEROP_RUNTIME_JSON"); - appendRuntimePathsConfigEnv(includeDirs,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + const auto appendConfig = [&](const system::path& path) { appendRuntimePathsConfig(includeDirs,path); }; + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",appendConfig); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",appendConfig); const auto exeDir = system::executableDirectory(); if (!exeDir.empty()) appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); - - #if defined(_NBL_PLATFORM_WINDOWS_) - const auto 
releaseModuleDir = system::loadedModuleDirectory("Nabla.dll"); - if (!releaseModuleDir.empty()) - appendRuntimePathsConfig(includeDirs,releaseModuleDir/RuntimePathsFileName); - const auto debugModuleDir = system::loadedModuleDirectory("Nabla_debug.dll"); - if (!debugModuleDir.empty()) - appendRuntimePathsConfig(includeDirs,debugModuleDir/RuntimePathsFileName); - #endif } void appendAppLocalIncludeDirs(core::vector& includeDirs) @@ -155,9 +182,10 @@ void appendAppLocalIncludeDirs(core::vector& includeDirs) return; appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); - appendIncludeDir(includeDirs,exeDir/"nvidia"/"cu13"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,exeDir/"nvidia",4u); appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,exeDir.parent_path()/"nvidia",4u); } void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) @@ -165,40 +193,17 @@ void appendPythonPackageIncludeDirs(core::vector& includeDirs, con if (root.empty()) return; - appendIncludeDir(includeDirs,root/"Lib"/"site-packages"/"nvidia"/"cu13"/"include"); - appendIncludeDir(includeDirs,root/"lib"/"site-packages"/"nvidia"/"cu13"/"include"); + appendCUDAIncludeDirsBelow(includeDirs,root/"Lib"/"site-packages"/"nvidia",4u); + appendCUDAIncludeDirsBelow(includeDirs,root/"lib"/"site-packages"/"nvidia",4u); appendIncludeDir(includeDirs,root/"Library"/"include"); appendIncludeDir(includeDirs,root/"include"); } -void appendPathListEnv(core::vector& includeDirs, const char* name) -{ - const auto value = readEnvironmentVariable(name); - if (value.empty()) - return; - - #if defined(_NBL_PLATFORM_WINDOWS_) - constexpr char Separator = ';'; - #else - constexpr char Separator = ':'; - #endif - - size_t begin = 0; - while (begin& includeDirs) { - appendPathListEnv(includeDirs,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); - 
appendPathListEnv(includeDirs,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + const auto appendInclude = [&](const system::path& path) { appendIncludeDir(includeDirs,path); }; + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); @@ -942,13 +947,18 @@ ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog) return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } +static const core::vector& getDefaultRuntimeIncludeOptions() +{ + static const auto RuntimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(cuda_interop::findRuntimeCompileEnvironment()); + return RuntimeIncludeOptions; +} + static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) { if (result!=NVRTC_SUCCESS) return {nullptr,result}; - const auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment(); - const auto runtimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto& runtimeIncludeOptions = getDefaultRuntimeIncludeOptions(); core::vector options; options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); for (const auto option : nvrtcOptions) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 214d5add14..0d7b01a033 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -90,8 +90,9 @@ Reference smoke: - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths. 
- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. -- Runtime lookup reads `nbl_cuda_interop_runtime.json` first, then checks app-local include bundles, explicit environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list. +- Runtime lookup reads explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then checks executable-local `nbl_cuda_interop_runtime.json`, app-local include bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- App-local and Python/conda package probing looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in the path. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list and caches the default discovery result after first use. - Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. - If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit. - `CUDA_PATH` is a developer fallback. It is not required for packaged applications. 
From c6ef6eea004ceeb2b25378f1312deb79cd21f283 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 14:30:58 +0200 Subject: [PATCH 103/149] Move CUDA interop API back into video --- include/nbl/ext/CUDAInterop/CUDAInterop.h | 13 -- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 2 +- include/nbl/ext/OptiX/IDenoiser.h | 2 +- .../{ext/CUDAInterop => video}/CCUDADevice.h | 6 +- .../CCUDAExportableMemory.h | 0 .../{ext/CUDAInterop => video}/CCUDAHandler.h | 0 .../CCUDAImportedMemory.h | 4 +- .../CCUDAImportedSemaphore.h | 0 include/nbl/video/CUDAInterop.h | 13 ++ src/nbl/CMakeLists.txt | 10 +- src/nbl/ext/CUDAInterop/README.md | 138 ++++++++---------- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 2 +- .../ext/CUDAInterop/smoke/public_boundary.cpp | 2 +- .../CUDAInterop => video}/CCUDADevice.cpp | 2 +- .../CCUDAExportableMemory.cpp | 2 +- .../CUDAInterop => video}/CCUDAHandler.cpp | 2 +- .../CCUDAImportedMemory.cpp | 2 +- .../CCUDAImportedSemaphore.cpp | 2 +- .../CUDAInteropNativeState.hpp | 4 +- 19 files changed, 91 insertions(+), 115 deletions(-) delete mode 100644 include/nbl/ext/CUDAInterop/CUDAInterop.h rename include/nbl/{ext/CUDAInterop => video}/CCUDADevice.h (94%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAExportableMemory.h (100%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAHandler.h (100%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAImportedMemory.h (86%) rename include/nbl/{ext/CUDAInterop => video}/CCUDAImportedSemaphore.h (100%) create mode 100644 include/nbl/video/CUDAInterop.h rename src/nbl/{ext/CUDAInterop => video}/CCUDADevice.cpp (99%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAExportableMemory.cpp (98%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAHandler.cpp (99%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAImportedMemory.cpp (97%) rename src/nbl/{ext/CUDAInterop => video}/CCUDAImportedSemaphore.cpp (97%) rename src/nbl/{ext/CUDAInterop => video}/CUDAInteropNativeState.hpp (96%) diff --git 
a/include/nbl/ext/CUDAInterop/CUDAInterop.h b/include/nbl/ext/CUDAInterop/CUDAInterop.h deleted file mode 100644 index 06d9016dc8..0000000000 --- a/include/nbl/ext/CUDAInterop/CUDAInterop.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". -// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ -#define _NBL_EXT_CUDA_INTEROP_H_INCLUDED_ - -#include "nbl/ext/CUDAInterop/CCUDADevice.h" -#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" - -#endif diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 6833ad8189..9d23fcb4ef 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -4,7 +4,7 @@ #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 496383d92d..bb0677657d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "nbl/ext/CUDAInterop/CCUDAHandler.h" +#include "nbl/video/CCUDAHandler.h" #include #include diff --git a/include/nbl/ext/CUDAInterop/CCUDADevice.h b/include/nbl/video/CCUDADevice.h similarity index 94% rename from include/nbl/ext/CUDAInterop/CCUDADevice.h rename to include/nbl/video/CCUDADevice.h index 94eb450802..bc1931e363 100644 --- a/include/nbl/ext/CUDAInterop/CCUDADevice.h 
+++ b/include/nbl/video/CCUDADevice.h @@ -5,9 +5,9 @@ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ #include "nbl/video/declarations.h" -#include "nbl/ext/CUDAInterop/CCUDAExportableMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedMemory.h" -#include "nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" #include #include diff --git a/include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h similarity index 100% rename from include/nbl/ext/CUDAInterop/CCUDAExportableMemory.h rename to include/nbl/video/CCUDAExportableMemory.h diff --git a/include/nbl/ext/CUDAInterop/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h similarity index 100% rename from include/nbl/ext/CUDAInterop/CCUDAHandler.h rename to include/nbl/video/CCUDAHandler.h diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h similarity index 86% rename from include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h rename to include/nbl/video/CCUDAImportedMemory.h index 87f804ce76..ac41c110a2 100644 --- a/include/nbl/ext/CUDAInterop/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -1,5 +1,5 @@ -#ifndef _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ -#define _NBL_EXT_CUDA_INTEROP_C_CUDA_IMPORTED_MEMORY_H_ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ #include "nbl/video/declarations.h" diff --git a/include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h similarity index 100% rename from include/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.h rename to include/nbl/video/CCUDAImportedSemaphore.h diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h new file mode 100644 index 0000000000..57e92ae647 --- /dev/null +++ b/include/nbl/video/CUDAInterop.h @@ -0,0 +1,13 @@ +// 
Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" + +#endif diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index ccb600ca32..d56c223e34 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -126,11 +126,11 @@ set(NBL_CORE_SOURCES ) set(NBL_CUDA_INTEROP_SOURCES - ext/CUDAInterop/CCUDADevice.cpp - ext/CUDAInterop/CCUDAExportableMemory.cpp - ext/CUDAInterop/CCUDAHandler.cpp - ext/CUDAInterop/CCUDAImportedMemory.cpp - ext/CUDAInterop/CCUDAImportedSemaphore.cpp + video/CCUDADevice.cpp + video/CCUDAExportableMemory.cpp + video/CCUDAHandler.cpp + video/CCUDAImportedMemory.cpp + video/CCUDAImportedSemaphore.cpp ) set(NBL_SYSTEM_SOURCES diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 0d7b01a033..e99edd82c0 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -1,50 +1,50 @@ -# CUDA Interop Targets +# CUDA Interop -- `Nabla::Nabla` owns the CUDA interop implementation and exported symbols. -- `Nabla::Nabla` public headers do not include `cuda.h` or `nvrtc.h`. -- The SDK-free interop headers stay stable for CUDA ON and CUDA OFF Nabla builds. -- `Nabla::ext::CUDAInterop` is the explicit raw CUDA Driver API and NVRTC opt-in target. -- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It does not build a library or executable artifact. -- The target only carries usage requirements and IDE-visible sources. -- `Nabla::ext::CUDAInterop` requires `CUDAToolkit` and exposes `CUDAInteropNative.h`. 
-- `CUDAInteropNative.h` is the small opt-in header that includes CUDA SDK headers such as `cuda.h` and `nvrtc.h`. -- Consumers can override the SDK root with `-DNabla_CUDA_TOOLKIT_ROOT=` when requesting `CUDAInterop`. -- Native accessors accept Nabla objects, raw pointers, and `smart_refctd_ptr`. +## Layout -## Basic Usage +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. +- Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*` accessors for CUDA Driver API and NVRTC types. + +## CMake Usage + +Default Nabla usage stays SDK-free: ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) ``` -This path does not require CUDA SDK headers on the consuming project. - -## Native Opt-In - -Use the native opt-in path only in targets that include `CUDAInteropNative.h` or use raw CUDA Driver API/NVRTC types. +Native CUDA interop is explicit: ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE) ``` -`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes runtime CUDA header discovery JSON for `native_app`. +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes `nbl_cuda_interop_runtime.json` next to the target executable during CMake generation. 
+ +Optional overrides: ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) nbl_target_link_cuda_interop(native_app PRIVATE INCLUDE_DIRS "${cuda_runtime_headers}" ) -``` -```cmake nbl_target_link_cuda_interop(native_app PRIVATE RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$<CONFIG>/my_cuda_runtime.json" ) ``` -Pseudo flow: +Consumers can also choose the SDK used for native compilation with: + +```cmake +cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT=<path> +``` + +## Native Usage ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" @@ -71,85 +71,61 @@ auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( ); ``` -`compileDirectlyToPTX` performs runtime CUDA header discovery internally. Code that drives NVRTC manually can call `cuda_interop::findRuntimeCompileEnvironment` and `cuda_interop::makeNVRTCIncludeOptions` directly. +Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly. + +Smoke examples: + +- `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. +- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks default package usage without native opt-in. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks native opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. -Reference smoke: +## ABI -- CMake target setup: `src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt` -- SDK-free package boundary check: `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` -- Default Nabla package usage without native opt-in: `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` -- Native CUDA opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC and raw interop usage: `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` +- `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. 
+- Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. +- `CUDAInteropNative.h` declares exported accessor functions whose definitions still live in `Nabla.dll`. +- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. +- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. ## Runtime Header Discovery -- `nbl_target_link_cuda_interop( )` links `Nabla::ext::CUDAInterop` and configures runtime include discovery for that target. -- The helper is defined once in `NablaCUDAInteropHelpers.cmake` and is available from the source tree and installed `NablaConfig.cmake`. -- For each target it writes `nbl_cuda_interop_runtime.json` next to the executable during CMake generation. -- `RUNTIME_JSON ` overrides the generated JSON location. Plain paths and `$` are supported. -- `cuda_interop::findRuntimeCompileEnvironment` can also receive explicit JSON paths at runtime. -- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. -- The JSON is a build artifact. Nabla packages do not install JSON files with host-specific CUDA paths. -- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. 
-- Runtime lookup reads explicit JSON paths and `NBL_CUDA_INTEROP_RUNTIME_JSON` first, then checks executable-local `nbl_cuda_interop_runtime.json`, app-local include bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. -- App-local and Python/conda package probing looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in the path. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to the NVRTC option list and caches the default discovery result after first use. -- Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. -- If an application compiles CUDA source with NVRTC and includes headers such as `cuda_fp16.h`, it must provide those runtime headers through the generated JSON path, an app-local bundle, a runtime/header package, or an installed toolkit. -- `CUDA_PATH` is a developer fallback. It is not required for packaged applications. -- Direct `target_link_libraries(app PRIVATE Nabla::ext::CUDAInterop)` remains possible, but it only adds compile/link usage requirements and does not create the runtime discovery JSON. +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. -## Runtime Header Distribution +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. +- The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. 
+- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. -Nabla packages do not ship CUDA runtime headers. That is a packaging choice, not a hard legal requirement for applications that need NVRTC runtime compilation. +Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. -NVIDIA CUDA EULA limits CUDA redistribution to selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A then lists the CUDA Toolkit files that may be redistributed with applications. See: +Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." 
See: - https://docs.nvidia.com/cuda/eula/#distribution - https://docs.nvidia.com/cuda/eula/#attachment-a -Relevant Attachment A header entries include: +Attachment A includes header groups relevant to NVRTC runtime compilation, including `nvrtc.h`, `cuda_fp16.h`, `cuda_bf16.h`, `cuda_fp8.h`, `cuda_fp6.h`, `cuda_fp4.h`, `cuda_runtime_api.h`, `cuda.h`, `vector_functions.h`, and `vector_types.h`. -- `nvrtc.h` under `NVIDIA Runtime Compilation Library and Header`. -- `cuda_occupancy.h` under `CUDA Occupancy Calculation Header Library`. -- `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` under `CUDA Floating Point Type Headers`. -- `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, `vector_types.h` under `CUDA Headers for Runtime Compilation`. - -CuPy documents the same runtime-compile problem. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They also show the common `vector_types.h` failure and recommend `nvidia-cuda-runtime-cu12` for PyPI installs or `cuda-cudart-dev` from system packages: +CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." 
They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: - https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 - https://github.com/cupy/cupy/issues/8466 -For Nabla consumers this means: - -- The default Nabla package stays SDK-free for consumers that only link `Nabla::Nabla`. -- Native interop consumers can install CUDA runtime headers through an official package, point `NBL_CUDA_INTEROP_RUNTIME_JSON` at their own JSON, pass `INCLUDE_DIRS` to `nbl_target_link_cuda_interop`, or ship an app-local header bundle if their distribution model allows it. -- Shipping such headers is a consumer packaging decision. Nabla runtime discovery supports it, but Nabla does not install host-specific CUDA header paths or redistribute CUDA headers by default. - -## Properties - -- Consumers that only link `Nabla::Nabla` do not need CUDA SDK headers to parse Nabla headers. -- Consumers that need raw CUDA include `CUDAInteropNative.h` and link `Nabla::ext::CUDAInterop`. -- Raw CUDA access is not wrapped away in the native opt-in path. Native code uses CUDA Driver API and NVRTC types directly. -- CUDA SDK structs with version-sensitive layout are kept out of exported Nabla ABI. -- The exported native ABI uses stable CUDA Driver API handles/enums and small Nabla-owned parameter structs. -- Native state is PIMPL-owned by Nabla. Consumers cannot construct CUDA wrapper objects with arbitrary internal state. -- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. -- `CCUDAHandler::create` validates the loaded CUDA driver and NVRTC runtime. It returns `nullptr` when the runtime is missing or below the required CUDA 13.0 / NVRTC 13.x floor. -- Runtime CUDA header discovery is independent from the CUDA SDK used to build Nabla. 
-- Native consumers can use a newer compatible CUDA SDK or a runtime/header package without rebuilding Nabla. -- Toggling Nabla CUDA support does not change SDK-free public header parse requirements for consumers. -- The Nabla source list is stable. CUDA interop `.cpp` files stay visible in IDE projects for CUDA ON and CUDA OFF builds. -- CUDA OFF implementations are local stubs in the same `.cpp` files. SDK-free API entry points stay linkable and factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +## CUDA ON/OFF Builds + +- SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. - CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. ## Related Designs -This split follows the same public-boundary pattern used by mature GPU projects: SDK-free default headers, native access through an explicit opt-in path, and SDK-dependent implementation details outside the default public API. +The split follows the same boundary pattern used by mature GPU projects: default headers avoid vendor SDK requirements, native access is explicit, and implementation details stay outside the default public API. -- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: [`cuda_stream_accessor.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79). 
-- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: [`private.cuda.hpp`](https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61). -- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: [`device.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27). -- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: [`device_impl.h`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30), [`device.cpp`](https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.cpp#L10-L48). -- ONNX Runtime keeps accelerator dependencies behind execution providers and supports provider shared libraries loaded only when requested: [`Build with Execution Providers`](https://onnxruntime.ai/docs/build/eps.html#execution-provider-shared-libraries). -- ggml/llama.cpp keeps the generic backend API separate from CUDA and builds CUDA as an explicit backend target with CUDA libraries linked to that backend: [`ggml-backend.h`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/include/ggml-backend.h#L1488-L1499), [`ggml-cuda CMakeLists.txt`](https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-cuda/CMakeLists.txt#L982-L1072). -- TensorFlow PluggableDevice uses separate device plugin packages so accelerator toolchains and dependencies do not become core TensorFlow requirements: [`PluggableDevice`](https://blog.tensorflow.org/2021/06/pluggabledevice-device-plugins-for-TensorFlow.html). 
+- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 +- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp index e36fe65701..31bf461804 100644 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp @@ -1,4 +1,4 @@ -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #include "nbl/system/IApplicationFramework.h" #include diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index eb7061f0ee..dc1c247806 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -18,7 +18,7 @@ #error "Nabla consumers must not include CUDA SDK headers." #endif -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #error "Nabla consumers must not get the CUDA opt-in define." 
diff --git a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp similarity index 99% rename from src/nbl/ext/CUDAInterop/CCUDADevice.cpp rename to src/nbl/video/CCUDADevice.cpp index 8e696d0827..fcafc8bc48 100644 --- a/src/nbl/ext/CUDAInterop/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,7 +1,7 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp similarity index 98% rename from src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp rename to src/nbl/video/CCUDAExportableMemory.cpp index 7d5483af04..4eb37b720a 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp similarity index 99% rename from src/nbl/ext/CUDAInterop/CCUDAHandler.cpp rename to src/nbl/video/CCUDAHandler.cpp index de7f14b58f..ced76b9713 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #include "nbl/system/ModuleLookupUtils.h" #include "nlohmann/json.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp similarity index 97% rename from src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp rename to src/nbl/video/CCUDAImportedMemory.cpp index 3a8ed56371..9e58fbac10 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp similarity index 97% rename from src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp rename to src/nbl/video/CCUDAImportedSemaphore.cpp index 6d980ed126..bc1db625d1 100644 --- a/src/nbl/ext/CUDAInterop/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -2,7 +2,7 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/ext/CUDAInterop/CUDAInterop.h" +#include "nbl/video/CUDAInterop.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" diff --git a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp similarity index 96% rename from src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp rename to src/nbl/video/CUDAInteropNativeState.hpp index 74cb7823d5..79139d015d 100644 --- a/src/nbl/ext/CUDAInterop/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -1,5 +1,5 @@ -#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ -#define _NBL_EXT_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" From d559a2caeafa9aef0c308b7716c77d4be076fc28 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 15:02:03 +0200 Subject: [PATCH 104/149] Move smart pointer helpers into core --- include/nbl/core/decl/smart_refctd_ptr.h | 38 ++++++++ .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 90 ++++++------------- 2 files changed, 66 insertions(+), 62 deletions(-) diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h index 7c231fea4b..78609fa34c 100644 --- a/include/nbl/core/decl/smart_refctd_ptr.h +++ b/include/nbl/core/decl/smart_refctd_ptr.h @@ -7,6 +7,10 @@ #include "nbl/core/IReferenceCounted.h" +#include +#include +#include + namespace nbl::core { @@ -118,6 +122,40 @@ class smart_refctd_ptr }; static_assert(sizeof(smart_refctd_ptr) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!"); +template +struct is_smart_refctd_ptr : std::false_type {}; + +template +struct is_smart_refctd_ptr> : std::true_type {}; + +template +inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; + +template 
+inline constexpr bool is_raw_pointer_or_smart_refctd_ptr_v = std::is_pointer_v> || is_smart_refctd_ptr_v; + +template +decltype(auto) dereference(Object&& object) +{ + using object_t = std::remove_cvref_t; + if constexpr (std::is_pointer_v) + return *object; + else if constexpr (is_smart_refctd_ptr_v) + return *object; + else + return std::forward(object); +} + +template +concept dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { + { dereference(std::forward(object)) } -> std::convertible_to; +}; + +template +concept const_dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { + { dereference(std::forward(object)) } -> std::convertible_to; +}; + template< class T, class... Args > smart_refctd_ptr make_smart_refctd_ptr(Args&& ... args); diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 9d23fcb4ef..fe5fb5875e 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -161,40 +161,6 @@ struct SExportableMemoryCreationParams namespace detail { -template -struct is_smart_refctd_ptr : std::false_type {}; - -template -struct is_smart_refctd_ptr> : std::true_type {}; - -template -inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; - -template -inline constexpr bool is_indirect_object_v = std::is_pointer_v> || is_smart_refctd_ptr_v; - -template -decltype(auto) as_ref(Object&& object) -{ - using object_t = std::remove_cvref_t; - if constexpr (std::is_pointer_v) - return *object; - else if constexpr (is_smart_refctd_ptr_v) - return *object; - else - return std::forward(object); -} - -template -concept object_like = is_indirect_object_v && requires(Object&& object) { - { as_ref(std::forward(object)) } -> std::convertible_to; -}; - -template -concept const_object_like = is_indirect_object_v && requires(Object&& object) { - { as_ref(std::forward(object)) } -> 
std::convertible_to; -}; - template concept program_text_source = std::same_as, std::string> || std::convertible_to; @@ -205,17 +171,17 @@ NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); template -requires detail::const_object_like +requires core::const_dereferenceable_to inline const CUDA& getCUDAFunctionTable(Handler&& handler) { - return getCUDAFunctionTable(detail::as_ref(std::forward(handler))); + return getCUDAFunctionTable(core::dereference(std::forward(handler))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline const NVRTC& getNVRTCFunctionTable(Handler&& handler) { - return getNVRTCFunctionTable(detail::as_ref(std::forward(handler))); + return getNVRTCFunctionTable(core::dereference(std::forward(handler))); } NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); @@ -228,10 +194,10 @@ T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); template -requires detail::const_object_like +requires core::const_dereferenceable_to inline const core::vector& getAvailableDevices(Handler&& handler) { - return getAvailableDevices(detail::as_ref(std::forward(handler))); + return getAvailableDevices(core::dereference(std::forward(handler))); } NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); @@ -242,10 +208,10 @@ inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, cons NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); template -requires 
detail::object_like && detail::program_text_source +requires core::dereferenceable_to && detail::program_text_source inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { - auto& handlerRef = detail::as_ref(std::forward(handler)); + auto& handlerRef = core::dereference(std::forward(handler)); if constexpr (std::same_as, std::string>) return createProgram(handlerRef,prog,std::string(std::forward(source)),name,headerCount,headerContents,includeNames); else @@ -256,10 +222,10 @@ inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& } template -requires detail::object_like && std::convertible_to +requires core::dereferenceable_to && std::convertible_to inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) { - return createProgram(detail::as_ref(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); + return createProgram(core::dereference(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); } NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -291,14 +257,14 @@ NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( ); template -requires detail::object_like && detail::program_text_source +requires core::dereferenceable_to && detail::program_text_source inline ptx_and_nvrtcResult_t compileDirectlyToPTX( Handler&& handler, Source&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - 
auto& handlerRef = detail::as_ref(std::forward(handler)); + auto& handlerRef = core::dereference(std::forward(handler)); if constexpr (std::same_as, std::string>) return compileDirectlyToPTX(handlerRef,std::string(std::forward(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); else @@ -309,14 +275,14 @@ inline ptx_and_nvrtcResult_t compileDirectlyToPTX( } template -requires detail::object_like && std::convertible_to +requires core::dereferenceable_to && std::convertible_to inline ptx_and_nvrtcResult_t compileDirectlyToPTX( Handler&& handler, File file, core::SRange nvrtcOptions, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, std::string* log=nullptr ) { - return compileDirectlyToPTX(detail::as_ref(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); + return compileDirectlyToPTX(core::dereference(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); } NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); @@ -330,48 +296,48 @@ NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& sem template requires ( - detail::const_object_like || - detail::const_object_like || - detail::const_object_like + core::const_dereferenceable_to || + core::const_dereferenceable_to || + core::const_dereferenceable_to ) inline auto getInternalObject(Object&& object) { - return getInternalObject(detail::as_ref(std::forward(object))); + return getInternalObject(core::dereference(std::forward(object))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline CUcontext getContext(Device&& device) { - return getContext(detail::as_ref(std::forward(device))); + return getContext(core::dereference(std::forward(device))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline size_t roundToGranularity(Device&& 
device, CUmemLocationType location, size_t size) { - return roundToGranularity(detail::as_ref(std::forward(device)),location,size); + return roundToGranularity(core::dereference(std::forward(device)),location,size); } template -requires detail::object_like +requires core::dereferenceable_to inline core::smart_refctd_ptr createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params) { - return createExportableMemory(detail::as_ref(std::forward(device)),std::move(params)); + return createExportableMemory(core::dereference(std::forward(device)),std::move(params)); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline CUdeviceptr getDeviceptr(Memory&& memory) { - return getDeviceptr(detail::as_ref(std::forward(memory))); + return getDeviceptr(core::dereference(std::forward(memory))); } template -requires detail::const_object_like +requires core::const_dereferenceable_to inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer) { - return getMappedBuffer(detail::as_ref(std::forward(memory)),mappedBuffer); + return getMappedBuffer(core::dereference(std::forward(memory)),mappedBuffer); } } From 38705b93794e820417a2b3f223d258e07aeebb8f Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 16:06:26 +0200 Subject: [PATCH 105/149] Use CUDA interop accessors --- examples_tests | 2 +- include/nbl/core/decl/smart_refctd_ptr.h | 39 ---- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 219 ++++-------------- src/nbl/ext/CUDAInterop/README.md | 22 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 18 +- src/nbl/video/CCUDADevice.cpp | 51 ++-- src/nbl/video/CCUDAExportableMemory.cpp | 10 +- src/nbl/video/CCUDAHandler.cpp | 61 ++--- src/nbl/video/CCUDAImportedMemory.cpp | 11 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 7 +- src/nbl/video/CUDAInteropNativeState.hpp | 17 +- 11 files changed, 135 insertions(+), 322 deletions(-) diff --git a/examples_tests b/examples_tests index 
b2c639c8b7..1dc7f6a075 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b2c639c8b71c3b860418dc4b3e46ad147ba5f256 +Subproject commit 1dc7f6a075c8c457b80388e59ef3da846bad03e4 diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h index 78609fa34c..814c807a84 100644 --- a/include/nbl/core/decl/smart_refctd_ptr.h +++ b/include/nbl/core/decl/smart_refctd_ptr.h @@ -7,10 +7,6 @@ #include "nbl/core/IReferenceCounted.h" -#include -#include -#include - namespace nbl::core { @@ -122,41 +118,6 @@ class smart_refctd_ptr }; static_assert(sizeof(smart_refctd_ptr) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!"); -template -struct is_smart_refctd_ptr : std::false_type {}; - -template -struct is_smart_refctd_ptr> : std::true_type {}; - -template -inline constexpr bool is_smart_refctd_ptr_v = is_smart_refctd_ptr>::value; - -template -inline constexpr bool is_raw_pointer_or_smart_refctd_ptr_v = std::is_pointer_v> || is_smart_refctd_ptr_v; - -template -decltype(auto) dereference(Object&& object) -{ - using object_t = std::remove_cvref_t; - if constexpr (std::is_pointer_v) - return *object; - else if constexpr (is_smart_refctd_ptr_v) - return *object; - else - return std::forward(object); -} - -template -concept dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { - { dereference(std::forward(object)) } -> std::convertible_to; -}; - -template -concept const_dereferenceable_to = is_raw_pointer_or_smart_refctd_ptr_v && requires(Object&& object) { - { dereference(std::forward(object)) } -> std::convertible_to; -}; - - template< class T, class... Args > smart_refctd_ptr make_smart_refctd_ptr(Args&& ... 
args); diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index fe5fb5875e..57669f591a 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -9,10 +9,7 @@ #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" -#include #include -#include -#include #include "cuda.h" #include "nvrtc.h" @@ -158,196 +155,62 @@ struct SExportableMemoryCreationParams CUmemLocationType location; }; -namespace detail -{ - -template -concept program_text_source = std::same_as, std::string> || - std::convertible_to; - -} - -NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); - -template -requires core::const_dereferenceable_to -inline const CUDA& getCUDAFunctionTable(Handler&& handler) -{ - return getCUDAFunctionTable(core::dereference(std::forward(handler))); -} - -template -requires core::const_dereferenceable_to -inline const NVRTC& getNVRTCFunctionTable(Handler&& handler) -{ - return getNVRTCFunctionTable(core::dereference(std::forward(handler))); -} - -NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); - -template -T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } - -NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); - -template -requires core::const_dereferenceable_to -inline const core::vector& getAvailableDevices(Handler&& handler) -{ - return getAvailableDevices(core::dereference(std::forward(handler))); -} - -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, 
const char* const* includeNames=nullptr); -inline nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(handler,prog,std::string(source),name,headerCount,headerContents,includeNames); -} -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - -template -requires core::dereferenceable_to && detail::program_text_source -inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, Source&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - auto& handlerRef = core::dereference(std::forward(handler)); - if constexpr (std::same_as, std::string>) - return createProgram(handlerRef,prog,std::string(std::forward(source)),name,headerCount,headerContents,includeNames); - else - { - const char* sourceText = source; - return createProgram(handlerRef,prog,sourceText,name,headerCount,headerContents,includeNames); - } -} - -template -requires core::dereferenceable_to && std::convertible_to -inline nvrtcResult createProgram(Handler&& handler, nvrtcProgram* prog, File file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) -{ - return createProgram(core::dereference(std::forward(handler)),prog,static_cast(file),headerCount,headerContents,includeNames); -} -NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); - struct ptx_and_nvrtcResult_t { core::smart_refctd_ptr ptx; nvrtcResult result; }; -NBL_API2 ptx_and_nvrtcResult_t getPTX(const 
CCUDAHandler& handler, nvrtcProgram prog); -NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -); -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(handler,std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); -} -NBL_API2 ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -); - -template -requires core::dereferenceable_to && detail::program_text_source -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - Handler&& handler, Source&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - auto& handlerRef = core::dereference(std::forward(handler)); - if constexpr (std::same_as, std::string>) - return compileDirectlyToPTX(handlerRef,std::string(std::forward(source)),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - else - { - const char* sourceText = source; - return compileDirectlyToPTX(handlerRef,sourceText,filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } -} - -template -requires core::dereferenceable_to && std::convertible_to -inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - Handler&& handler, File file, core::SRange nvrtcOptions, - const int 
headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr -) -{ - return compileDirectlyToPTX(core::dereference(std::forward(handler)),static_cast(file),nvrtcOptions,headerCount,headerContents,includeNames,log); -} - -NBL_API2 CUdevice getInternalObject(const CCUDADevice& device); -NBL_API2 CUcontext getContext(const CCUDADevice& device); -NBL_API2 size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); -NBL_API2 core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); -NBL_API2 CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); -NBL_API2 CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); -NBL_API2 CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); -NBL_API2 CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); - -template -requires ( - core::const_dereferenceable_to || - core::const_dereferenceable_to || - core::const_dereferenceable_to -) -inline auto getInternalObject(Object&& object) -{ - return getInternalObject(core::dereference(std::forward(object))); -} - -template -requires core::const_dereferenceable_to -inline CUcontext getContext(Device&& device) -{ - return getContext(core::dereference(std::forward(device))); -} +// These are opt-in CUDA-native declarations for symbols implemented and exported by Nabla. +// Only consumers that include this header and link Nabla::ext::CUDAInterop see CUDA SDK types. 
+class NBL_API2 CCUDAHandlerAccessor +{ + public: + static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); + static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); + static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); + static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + static const core::vector& getAvailableDevices(const CCUDAHandler& handler); + static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); + static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); + static ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); + static ptx_and_nvrtcResult_t compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, + std::string* log=nullptr + ); +}; -template -requires core::const_dereferenceable_to -inline size_t roundToGranularity(Device&& device, CUmemLocationType location, size_t size) +class NBL_API2 CCUDADeviceAccessor { - return roundToGranularity(core::dereference(std::forward(device)),location,size); -} + public: + static CUdevice getInternalObject(const CCUDADevice& device); + static CUcontext getContext(const CCUDADevice& device); + static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); + static core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); +}; -template -requires 
core::dereferenceable_to -inline core::smart_refctd_ptr createExportableMemory(Device&& device, SExportableMemoryCreationParams&& params) +class NBL_API2 CCUDAExportableMemoryAccessor { - return createExportableMemory(core::dereference(std::forward(device)),std::move(params)); -} + public: + static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); +}; -template -requires core::const_dereferenceable_to -inline CUdeviceptr getDeviceptr(Memory&& memory) +class NBL_API2 CCUDAImportedMemoryAccessor { - return getDeviceptr(core::dereference(std::forward(memory))); -} + public: + static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); + static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); +}; -template -requires core::const_dereferenceable_to -inline CUresult getMappedBuffer(Memory&& memory, CUdeviceptr* mappedBuffer) +class NBL_API2 CCUDAImportedSemaphoreAccessor { - return getMappedBuffer(core::dereference(std::forward(memory)),mappedBuffer); -} + public: + static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); +}; } -#define ASSERT_CUDA_SUCCESS(expr, handler) \ - do { \ - const auto cudaResult = (expr); \ - if (!nbl::video::cuda_native::defaultHandleResult(*(handler), cudaResult)) { \ - assert(false); \ - } \ - } while(0) - #endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index e99edd82c0..ea92dcec7d 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -5,7 +5,7 @@ - `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. - Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. 
It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*` accessors for CUDA Driver API and NVRTC types. +- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*Accessor` classes for CUDA Driver API and NVRTC types. ## CMake Usage @@ -52,16 +52,17 @@ cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); -auto memory = nbl::video::cuda_native::createExportableMemory(cudaDevice, { +auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { .size = size, .alignment = alignment, .location = CU_MEM_LOCATION_TYPE_DEVICE, }); std::string log; -auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( - handler, - cudaSource, +std::string cudaSource = loadKernelText(); +auto [ptx, result] = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + *handler, + std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), 0, @@ -71,7 +72,12 @@ auto [ptx, result] = nbl::video::cuda_native::compileDirectlyToPTX( ); ``` -Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly. +Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly through accessor classes: + +- `CCUDAHandlerAccessor` exposes CUDA/NVRTC function tables, NVRTC program helpers, PTX compilation, native device enumeration, and default error handling. +- `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. 
+- `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. +- Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. Smoke examples: @@ -84,7 +90,7 @@ Smoke examples: - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. -- `CUDAInteropNative.h` declares exported accessor functions whose definitions still live in `Nabla.dll`. +- `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. - Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. - A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. @@ -98,7 +104,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. 
- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. +- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 3b799a56cf..0b07bfa137 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -23,7 +23,7 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { + auto cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(cudaDevice, { .size = 4096, .alignment = 4096, .location = CU_MEM_LOCATION_TYPE_DEVICE, @@ -37,16 +37,16 @@ using namespace nbl::video; CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) - cuda_native::getMappedBuffer(importedFromVulkan,&mappedVulkanMemory); + cuda_native::CCUDAImportedMemoryAccessor::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); - const CUdeviceptr cudaDevicePtr = cuda_native::getDeviceptr(cudaMemory); - const CUexternalSemaphore 
cudaSemaphore = importedSemaphore ? cuda_native::getInternalObject(importedSemaphore):nullptr; + const CUdeviceptr cudaDevicePtr = cuda_native::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaMemory); + const CUexternalSemaphore cudaSemaphore = importedSemaphore ? cuda_native::CCUDAImportedSemaphoreAccessor::getInternalObject(*importedSemaphore):nullptr; return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = cuda_native::getCUDAFunctionTable(handler); + auto& cuda = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(handler); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -95,9 +95,9 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto [ptx, result] = cuda_native::compileDirectlyToPTX( + auto [ptx, result] = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( handler, - Source, + std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, 0, @@ -121,7 +121,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!isAPILoaded()) return false; - static_assert(std::is_same_v())), CUdevice>); + static_assert(std::is_same_v())), CUdevice>); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); @@ -144,7 +144,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!cudaFp16HeaderCompileProbe(*handler)) return false; - const auto& devices = nbl::video::cuda_native::getAvailableDevices(handler); + const auto& devices = nbl::video::cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler); if (devices.empty()) return true; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index fcafc8bc48..359cd093a1 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -34,10 +34,12 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle), m_handler); - ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_native->context), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + assert(false); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + assert(false); for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) { @@ -50,30 +52,31 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::getAllocationHandleType(), + .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + assert(false); } } namespace cuda_native { -CUdevice getInternalObject(const CCUDADevice& device) +CUdevice CCUDADeviceAccessor::getInternalObject(const CCUDADevice& device) { return SAccess::native(device).handle; } -CUcontext getContext(const CCUDADevice& device) +CUcontext 
CCUDADeviceAccessor::getContext(const CCUDADevice& device) { return SAccess::native(device).context; } -size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) +size_t CCUDADeviceAccessor::roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) { const auto& granularity = SAccess::native(device).allocationGranularity[location]; return ((size - 1) / granularity + 1) * granularity; @@ -90,7 +93,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept { const auto handler = device.getHandler(); const auto& native = cuda_native::SAccess::native(device); - const auto& cu = cuda_native::getCUDAFunctionTable(*handler); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -98,7 +101,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + assert(false); return err; } @@ -109,8 +113,10 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), handler); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) + assert(false); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + assert(false); return err; } @@ -122,7 +128,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept namespace cuda_native { 
-core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +core::smart_refctd_ptr CCUDADeviceAccessor::createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) { const auto handler = device.getHandler(); auto& native = SAccess::native(device); @@ -131,11 +137,11 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice CCUDAExportableMemory::SCachedCreationParams params = { .size = inParams.size, .alignment = inParams.alignment, - .granularSize = roundToGranularity(device, inParams.location, inParams.size), + .granularSize = CCUDADeviceAccessor::roundToGranularity(device, inParams.location, inParams.size), .deviceLocal = isDeviceLocal(inParams.location) }; - auto& cu = getCUDAFunctionTable(*handler); + auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -145,7 +151,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = getAllocationHandleType(), + .requestedHandleTypes = SAccess::allocationHandleType(), .location = { .type = inParams.location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, @@ -164,7 +170,8 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); + if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + assert(false); return nullptr; } @@ -172,7 +179,8 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice { logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), handler); + if 
(!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + assert(false); bool closeSucceed = CloseExternalHandle(params.externalHandle); assert(closeSucceed); @@ -194,7 +202,7 @@ core::smart_refctd_ptr createExportableMemory(CCUDADevice core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -225,7 +233,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -258,7 +266,8 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - ASSERT_CUDA_SUCCESS(cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context), m_handler); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + assert(false); } } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 4eb37b720a..f84169e38f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -52,11 +52,13 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - 
ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize), m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) + assert(false); - ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize), m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) + assert(false); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); assert(closeSucceed); @@ -66,7 +68,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() namespace cuda_native { -CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory) +CUdeviceptr CCUDAExportableMemoryAccessor::getDeviceptr(const CCUDAExportableMemory& memory) { return SAccess::native(memory).ptr; } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index ced76b9713..0064a191a6 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -355,7 +355,7 @@ CCUDAHandler::~CCUDAHandler() = default; namespace cuda_native { -bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) { @@ -721,12 +721,12 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) return false; } -bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) +bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, CUresult result) { - return defaultHandleResult(result,SAccess::logger(handler)); + return CCUDAHandlerAccessor::defaultHandleResult(result,SAccess::logger(handler)); } -bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) +bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch 
(result) { @@ -874,22 +874,22 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste namespace cuda_native { -const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) +const CUDA& CCUDAHandlerAccessor::getCUDAFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).cuda; } -const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) +const NVRTC& CCUDAHandlerAccessor::getNVRTCFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).nvrtc; } -const core::vector& getAvailableDevices(const CCUDAHandler& handler) +const core::vector& CCUDAHandlerAccessor::getAvailableDevices(const CCUDAHandler& handler) { return SAccess::native(handler).availableDevices; } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -901,24 +901,12 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); } -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, system::IFile* file, const int headerCount, const char* const* headerContents, const char* const* includeNames) -{ - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::IFile::success_t bytesRead; - file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); - - return 
createProgram(handler,prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); -} - -nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +nvrtcResult CCUDAHandlerAccessor::compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) { return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) +nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); @@ -931,7 +919,7 @@ nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::s return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +ptx_and_nvrtcResult_t CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); @@ -968,16 +956,16 @@ static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); - result = compileProgram(handler,program,{optionsBegin,optionsEnd}); + result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); if (log) - getProgramLog(handler,program,*log); + CCUDAHandlerAccessor::getProgramLog(handler,program,*log); if (result!=NVRTC_SUCCESS) return {nullptr,result}; - return getPTX(handler,program); + return CCUDAHandlerAccessor::getPTX(handler,program); } -ptx_and_nvrtcResult_t compileDirectlyToPTX( +ptx_and_nvrtcResult_t CCUDAHandlerAccessor::compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, const int headerCount, const char* const* headerContents, const char* const* includeNames, std::string* log) @@ -990,24 +978,7 @@ ptx_and_nvrtcResult_t compileDirectlyToPTX( SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); }); - result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); -} - -ptx_and_nvrtcResult_t compileDirectlyToPTX( - CCUDAHandler& handler, system::IFile* file, core::SRange nvrtcOptions, - const int headerCount, const char* const* headerContents, const char* const* includeNames, - std::string* log) -{ - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); - }); - - result = createProgram(handler,&program,file,headerCount,headerContents,includeNames); + result = CCUDAHandlerAccessor::createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 
9e58fbac10..9145fe18ac 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -21,18 +21,18 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev namespace cuda_native { -CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory) +CUexternalMemory CCUDAImportedMemoryAccessor::getInternalObject(const CCUDAImportedMemory& memory) { return SAccess::native(memory).handle; } -CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) +CUresult CCUDAImportedMemoryAccessor::getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) { CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; bufferDesc.size = SAccess::source(memory)->getAllocationSize(); - const auto& cu = getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); + const auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); } @@ -41,8 +41,9 @@ CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedB CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); - ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_native->handle), m_device->getHandler()); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) + assert(false); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index bc1db625d1..5d7d3e07ae 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -20,7 +20,7 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrgetHandler()); - 
ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_native->handle), m_device->getHandler()); + auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) + assert(false); } } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 79139d015d..7e602bb0f3 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -57,15 +57,6 @@ struct CCUDAImportedSemaphore::SNativeState namespace cuda_native { -inline CUmemAllocationHandleType getAllocationHandleType() -{ -#ifdef _WIN32 - return CU_MEM_HANDLE_TYPE_WIN32; -#else - return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; -#endif -} - struct SAccess { static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } @@ -96,6 +87,14 @@ struct SAccess static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); } static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } + static CUmemAllocationHandleType allocationHandleType() + { + #ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; + #else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + #endif + } }; } From 23e6ef5235ebf2b6f86694652f437b37b0479c53 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 16:36:31 +0200 Subject: [PATCH 106/149] Use explicit CUDA compile log --- examples_tests | 2 +- include/nbl/ext/CUDAInterop/CUDAInteropNative.h | 9 ++++----- src/nbl/ext/CUDAInterop/README.md | 7 ++++--- src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp | 8 ++++---- src/nbl/video/CCUDAHandler.cpp | 15 +++++++-------- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/examples_tests 
b/examples_tests index 1dc7f6a075..3c57a88af9 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 1dc7f6a075c8c457b80388e59ef3da846bad03e4 +Subproject commit 3c57a88af9eba722fcc6b5b5ba3d136ab3e166ca diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 57669f591a..d409c774e1 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -155,7 +155,7 @@ struct SExportableMemoryCreationParams CUmemLocationType location; }; -struct ptx_and_nvrtcResult_t +struct SPTXResult { core::smart_refctd_ptr ptx; nvrtcResult result; @@ -175,11 +175,10 @@ class NBL_API2 CCUDAHandlerAccessor static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); - static ptx_and_nvrtcResult_t getPTX(const CCUDAHandler& handler, nvrtcProgram prog); - static ptx_and_nvrtcResult_t compileDirectlyToPTX( + static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); + static SPTXResult compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); }; diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index ea92dcec7d..7d350da379 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -60,15 +60,15 
@@ auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemo std::string log; std::string cudaSource = loadKernelText(); -auto [ptx, result] = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( +auto compile = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( *handler, std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), + log, 0, nullptr, - nullptr, - &log + nullptr ); ``` @@ -78,6 +78,7 @@ Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC ty - `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. - `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. - Accessor methods take explicit Nabla references. Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. +- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. There is no optional output pointer in the public API. 
Smoke examples: diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 0b07bfa137..ace1059215 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -95,17 +95,17 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto [ptx, result] = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + auto compile = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( handler, std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, + log, 0, nullptr, - nullptr, - &log + nullptr ); - return result==NVRTC_SUCCESS && ptx && ptx->getSize()>0u; + return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 0064a191a6..9db99e7642 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -919,7 +919,7 @@ nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvr return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -ptx_and_nvrtcResult_t CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +SPTXResult CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); @@ -941,8 +941,9 @@ static const core::vector& getDefaultRuntimeIncludeOptions() return RuntimeIncludeOptions; } -static ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) +static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) { + log.clear(); if (result!=NVRTC_SUCCESS) return {nullptr,result}; @@ -957,24 +958,22 @@ static 
ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(CCUDAHandler& handler, nv const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); - if (log) - CCUDAHandlerAccessor::getProgramLog(handler,program,*log); + CCUDAHandlerAccessor::getProgramLog(handler,program,log); if (result!=NVRTC_SUCCESS) return {nullptr,result}; return CCUDAHandlerAccessor::getPTX(handler,program); } -ptx_and_nvrtcResult_t CCUDAHandlerAccessor::compileDirectlyToPTX( +SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount, const char* const* headerContents, const char* const* includeNames, - std::string* log) + std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { nvrtcProgram program = nullptr; nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; auto cleanup = core::makeRAIIExiter([&]() -> void { - if (result!=NVRTC_SUCCESS && program) + if (program) SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); }); From a640183dbc6229f3b9b60c1d22bb1c50c7b8e5fe Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 17:05:26 +0200 Subject: [PATCH 107/149] Trim CUDA interop API surface --- cmake/common.cmake | 19 +++------------- include/nbl/core/decl/smart_refctd_ptr.h | 1 + .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 4 ++-- include/nbl/video/CCUDADevice.h | 2 -- include/nbl/video/CCUDAHandler.h | 12 ++++++---- src/nbl/ext/CUDAInterop/README.md | 11 +++++++--- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 5 ++++- src/nbl/video/CCUDAHandler.cpp | 22 ++++--------------- 8 files changed, 30 insertions(+), 46 deletions(-) diff --git a/cmake/common.cmake b/cmake/common.cmake index ae2264fda4..c50e1f6fb2 100755 --- 
a/cmake/common.cmake +++ b/cmake/common.cmake @@ -284,22 +284,9 @@ function(nbl_install_dir _DIR) endfunction() function(nbl_install_lib_spec _TARGETS _RELATIVE_DESTINATION) - cmake_parse_arguments(_NBL_INSTALL_LIB "" "EXPORT" "" ${ARGN}) - if(_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Unexpected arguments for nbl_install_lib_spec: ${_NBL_INSTALL_LIB_UNPARSED_ARGUMENTS}") - endif() - - if(_NBL_INSTALL_LIB_EXPORT) - install(TARGETS ${_TARGETS} - EXPORT ${_NBL_INSTALL_LIB_EXPORT} - ARCHIVE DESTINATION ${_NBL_CPACK_PACKAGE_RELATIVE_ENTRY_}/lib/${_RELATIVE_DESTINATION} - COMPONENT Libraries - ) - else() - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) - install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) - endif() + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Release COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION debug/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS Debug COMPONENT Libraries) + install(TARGETS ${_TARGETS} ARCHIVE DESTINATION relwithdebinfo/lib/${_RELATIVE_DESTINATION} CONFIGURATIONS RelWithDebInfo COMPONENT Libraries) endfunction() function(nbl_install_lib _TARGETS) diff --git a/include/nbl/core/decl/smart_refctd_ptr.h b/include/nbl/core/decl/smart_refctd_ptr.h index 814c807a84..7c231fea4b 100644 --- a/include/nbl/core/decl/smart_refctd_ptr.h +++ b/include/nbl/core/decl/smart_refctd_ptr.h @@ -118,6 +118,7 @@ class smart_refctd_ptr }; static_assert(sizeof(smart_refctd_ptr) == sizeof(IReferenceCounted*), "smart_refctd_ptr has a memory overhead!"); + template< class T, class... Args > smart_refctd_ptr make_smart_refctd_ptr(Args&& ... 
args); diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index d409c774e1..daf3dcb4d1 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -161,8 +161,8 @@ struct SPTXResult nvrtcResult result; }; -// These are opt-in CUDA-native declarations for symbols implemented and exported by Nabla. -// Only consumers that include this header and link Nabla::ext::CUDAInterop see CUDA SDK types. +// Opt-in native CUDA API. The declarations below are implemented by the Nabla library. +// This header is intentionally the only public path that includes CUDA SDK types. class NBL_API2 CCUDAHandlerAccessor { public: diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index bc1931e363..7c1d1f272b 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -89,8 +89,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); - static constexpr auto CudaMemoryLocationCount = 5; - const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index f6b5d578a8..bb2d12c637 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -33,12 +33,17 @@ inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.js struct SRuntimeCompileEnvironment { core::vector includeDirs; - core::vector runtimePathFiles; }; NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector 
explicitIncludeDirs, core::vector runtimePathFiles); -NBL_API2 core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment); +inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} } class NBL_API2 CCUDAHandler : public core::IReferenceCounted @@ -73,7 +78,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted friend struct cuda_native::SAccess; struct SNativeState; - CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); std::unique_ptr m_native; core::vector m_availableDevices; @@ -82,7 +87,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::vector m_headerNamesStorage; core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; - int m_version; }; } diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 7d350da379..fb9896e30e 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -93,11 +93,12 @@ Smoke examples: - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. - Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. +- Runtime include-option construction is header-only and is not part of the exported ABI. 
- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. ## Runtime Header Discovery -NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a default `Nabla::Nabla` package requirement. - `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. - The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. @@ -105,7 +106,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options and caches the default discovery result after first use. +- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. 
Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. @@ -114,7 +115,11 @@ Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows red - https://docs.nvidia.com/cuda/eula/#distribution - https://docs.nvidia.com/cuda/eula/#attachment-a -Attachment A includes header groups relevant to NVRTC runtime compilation, including `nvrtc.h`, `cuda_fp16.h`, `cuda_bf16.h`, `cuda_fp8.h`, `cuda_fp6.h`, `cuda_fp4.h`, `cuda_runtime_api.h`, `cuda.h`, `vector_functions.h`, and `vector_types.h`. +Attachment A lists header groups relevant to NVRTC runtime compilation: + +- NVIDIA Runtime Compilation Library and Header: `nvrtc.h` +- CUDA Floating Point Type Headers: `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` +- CUDA Headers for Runtime Compilation: `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, and `vector_types.h` CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." 
They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index ace1059215..5d35ec8bed 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -53,11 +53,13 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) return false; CUcontext poppedContext = nullptr; + bool contextPushed = false; auto releaseContext = [&]() { if (context) { - cuda.pcuCtxPopCurrent_v2(&poppedContext); + if (contextPushed) + cuda.pcuCtxPopCurrent_v2(&poppedContext); cuda.pcuDevicePrimaryCtxRelease_v2(device); } }; @@ -67,6 +69,7 @@ bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) releaseContext(); return false; } + contextPushed = true; constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; std::array output = {}; diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9db99e7642..22ed5d0eb3 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,7 +3,6 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CUDAInterop.h" -#include "nbl/system/ModuleLookupUtils.h" #include "nlohmann/json.hpp" @@ -253,11 +252,10 @@ void appendSystemIncludeDirs(core::vector& includeDirs) SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) { SRuntimeCompileEnvironment environment; - environment.runtimePathFiles = std::move(runtimePathFiles); for (auto& includeDir : explicitIncludeDirs) appendIncludeDir(environment.includeDirs,std::move(includeDir)); - appendRuntimePathsConfigs(environment.includeDirs,environment.runtimePathFiles); + appendRuntimePathsConfigs(environment.includeDirs,runtimePathFiles); appendAppLocalIncludeDirs(environment.includeDirs); 
appendEnvironmentIncludeDirs(environment.includeDirs); appendSystemIncludeDirs(environment.includeDirs); @@ -270,14 +268,6 @@ SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) -{ - core::vector options; - for (const auto& includeDir : environment.includeDirs) - options.push_back("-I" + includeDir.generic_string()); - return options; -} - } #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -307,12 +297,10 @@ int cudaVersionMinor(int version) CCUDAHandler::CCUDAHandler( std::unique_ptr&& nativeState, core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) + core::smart_refctd_ptr&& _logger) : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) - , m_version(_version) { assert(m_native); @@ -866,7 +854,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste } return core::smart_refctd_ptr( - new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger),cudaVersion), + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger)), core::dont_grab ); } @@ -1097,12 +1085,10 @@ struct CCUDAHandler::SNativeState {}; CCUDAHandler::CCUDAHandler( std::unique_ptr&& nativeState, core::vector>&& _headers, - core::smart_refctd_ptr&& _logger, - int _version) + core::smart_refctd_ptr&& _logger) : m_native(std::move(nativeState)) , m_headers(std::move(_headers)) , m_logger(std::move(_logger)) - , m_version(_version) { assert(m_native); } From 5bf0e2d9c70280851f6779ce6a25b853f1730829 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 17:24:10 +0200 Subject: [PATCH 108/149] Keep CUDA SDK layouts private --- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 1 - src/nbl/ext/CUDAInterop/README.md | 7 +++++-- src/nbl/video/CCUDAHandler.cpp | 18 +++++++++--------- src/nbl/video/CUDAInteropNativeState.hpp | 7 +++++++ 4 files 
changed, 21 insertions(+), 12 deletions(-) diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index daf3dcb4d1..6d142c6b3f 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -145,7 +145,6 @@ struct SCUDADeviceInfo { CUdevice handle = {}; CUuuid uuid = {}; - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; }; struct SExportableMemoryCreationParams diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index fb9896e30e..d60b15639a 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -44,6 +44,8 @@ Consumers can also choose the SDK used for native compilation with: cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= ``` +This affects native opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. + ## Native Usage ```cpp @@ -92,9 +94,10 @@ Smoke examples: - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. -- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small Nabla-owned parameter structs. +- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small fixed-layout parameter/result structs. +- SDK-sized arrays and other layouts derived from CUDA SDK constants stay private to Nabla. 
A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. - Runtime include-option construction is header-only and is not part of the exported ABI. -- A package built with one compatible CUDA SDK can be consumed by native interop code built with another compatible SDK without rebuilding Nabla. The loaded driver and NVRTC runtime are still validated at runtime. +- The loaded CUDA driver and NVRTC runtime are validated at runtime. ## Runtime Header Discovery diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 22ed5d0eb3..78434d9bd5 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -325,15 +325,15 @@ CCUDAHandler::CCUDAHandler( if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; - auto& nativeDevice = m_native->availableDevices.emplace_back(); - nativeDevice.handle = handle; - nativeDevice.uuid = uuid; + auto& nativeDevice = m_native->deviceStates.emplace_back(); + nativeDevice.info.handle = handle; + nativeDevice.info.uuid = uuid; + m_native->availableDevices.push_back(nativeDevice.info); auto& cleanDevice = m_availableDevices.emplace_back(); memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); - int* attributes = nativeDevice.attributes; - for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) - m_native->cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + for (size_t i = 0; i < nativeDevice.attributes.size(); i++) + m_native->cuda.pcuDeviceGetAttribute(&nativeDevice.attributes[i], static_cast(i), handle); } } @@ -979,9 +979,9 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - for (const auto& device : m_native->availableDevices) + for (const auto& device : m_native->deviceStates) { - if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if 
(!memcmp(&device.info.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; @@ -1064,7 +1064,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct continue; return core::smart_refctd_ptr( - new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.info.handle),core::smart_refctd_ptr(this)), core::dont_grab ); } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 7e602bb0f3..4be8178aa2 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -10,9 +10,16 @@ namespace nbl::video struct CCUDAHandler::SNativeState { + struct SDeviceState + { + cuda_native::SCUDADeviceInfo info = {}; + std::array attributes = {}; + }; + cuda_native::CUDA cuda; cuda_native::NVRTC nvrtc; core::vector availableDevices; + core::vector deviceStates; SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) : cuda(std::move(_cuda)) From d745421cc25114adf5664fb778e873db8e8f5c7a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 17:51:35 +0200 Subject: [PATCH 109/149] Simplify CUDA interop helper --- cmake/NablaCUDAInteropHelpers.cmake | 190 +++------------------------- src/nbl/ext/CUDAInterop/README.md | 9 +- 2 files changed, 26 insertions(+), 173 deletions(-) diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake index 9c1ac657d4..e84b2d1a8e 100644 --- a/cmake/NablaCUDAInteropHelpers.cmake +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -1,182 +1,28 @@ -function(_nbl_cuda_interop_collect_runtime_include_dirs _OUT_INCLUDE_DIRS) - set(_include_dirs ${ARGN}) - - if(DEFINED CUDAToolkit_INCLUDE_DIRS 
AND NOT "${CUDAToolkit_INCLUDE_DIRS}" STREQUAL "") - list(APPEND _include_dirs ${CUDAToolkit_INCLUDE_DIRS}) +function(nbl_target_link_cuda_interop TARGET_NAME SCOPE) + if(NOT SCOPE MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + set(SCOPE PRIVATE) endif() - - if(TARGET CUDA::toolkit) - get_target_property(_cuda_toolkit_include_dirs CUDA::toolkit INTERFACE_INCLUDE_DIRECTORIES) - if(_cuda_toolkit_include_dirs AND NOT _cuda_toolkit_include_dirs STREQUAL "NOTFOUND") - list(APPEND _include_dirs ${_cuda_toolkit_include_dirs}) - endif() - endif() - - if(_include_dirs) - list(REMOVE_DUPLICATES _include_dirs) - endif() - - set(${_OUT_INCLUDE_DIRS} ${_include_dirs} PARENT_SCOPE) -endfunction() - -function(_nbl_cuda_interop_make_runtime_paths_json _OUT_CONTENT) - set(_include_dirs ${ARGN}) - set(_cuda_runtime_include_dir_entries "") - - foreach(_include_dir IN LISTS _include_dirs) - if("${_include_dir}" STREQUAL "") - continue() + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + target_link_libraries("${TARGET_NAME}" ${SCOPE} Nabla::ext::CUDAInterop) + set(_include_dir_entries "") + foreach(_include_dir IN LISTS _NBL_CUDA_INTEROP_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) + if(_include_dir) + file(TO_CMAKE_PATH "${_include_dir}" _include_dir) + list(APPEND _include_dir_entries " \"${_include_dir}\"") endif() - - file(TO_CMAKE_PATH "${_include_dir}" _include_dir_json) - string(REPLACE "\"" "\\\"" _include_dir_json "${_include_dir_json}") - - list(APPEND _cuda_runtime_include_dir_entries " \"${_include_dir_json}\"") endforeach() - - set(_json_entry_separator [=[ -, -]=]) - list(JOIN _cuda_runtime_include_dir_entries "${_json_entry_separator}" _cuda_runtime_include_dirs) - - set(_json [=[ + list(JOIN _include_dir_entries "," _include_dirs_json) + set(_runtime_json [=[ { "cudaRuntimeIncludeDirs": [ -@_cuda_runtime_include_dirs@ +@_include_dirs_json@ ] } ]=]) - string(CONFIGURE "${_json}" _json @ONLY) - set(${_OUT_CONTENT} "${_json}" PARENT_SCOPE) 
-endfunction() - -function(_nbl_cuda_interop_collect_configs _OUT_CONFIGS) - if(CMAKE_CONFIGURATION_TYPES) - set(_configs ${CMAKE_CONFIGURATION_TYPES}) - elseif(CMAKE_BUILD_TYPE) - set(_configs "${CMAKE_BUILD_TYPE}") - else() - set(_configs Debug) - endif() - - list(REMOVE_DUPLICATES _configs) - set(${_OUT_CONFIGS} ${_configs} PARENT_SCOPE) -endfunction() - -function(_nbl_cuda_interop_collect_target_runtime_jsons TARGET_NAME _OUT_FILES _OVERRIDE_OUTPUT) - _nbl_cuda_interop_collect_configs(_configs) - set(_runtime_jsons "") - - if(NOT "${_OVERRIDE_OUTPUT}" STREQUAL "") - foreach(_config IN LISTS _configs) - set(_runtime_paths_json "${_OVERRIDE_OUTPUT}") - string(REPLACE "$" "${_config}" _runtime_paths_json "${_runtime_paths_json}") - if(_runtime_paths_json MATCHES "\\$<") - message(FATAL_ERROR "Nabla: CUDA interop runtime JSON path supports only plain paths or $.") - endif() - cmake_path(IS_ABSOLUTE _runtime_paths_json _is_abs) - if(NOT _is_abs) - cmake_path(ABSOLUTE_PATH _runtime_paths_json BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_paths_json) - endif() - cmake_path(NORMAL_PATH _runtime_paths_json OUTPUT_VARIABLE _runtime_paths_json) - list(APPEND _runtime_jsons "${_runtime_paths_json}") - endforeach() - list(REMOVE_DUPLICATES _runtime_jsons) - set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) - return() - endif() - - foreach(_config IN LISTS _configs) - string(TOUPPER "${_config}" _config_upper) - get_target_property(_runtime_output_dir "${TARGET_NAME}" "RUNTIME_OUTPUT_DIRECTORY_${_config_upper}") - - if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") - get_target_property(_runtime_output_dir "${TARGET_NAME}" RUNTIME_OUTPUT_DIRECTORY) - endif() - if((NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}) - set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY_${_config_upper}}") - endif() - if((NOT _runtime_output_dir OR _runtime_output_dir 
STREQUAL "NOTFOUND") AND DEFINED CMAKE_RUNTIME_OUTPUT_DIRECTORY) - set(_runtime_output_dir "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}") - endif() - if(NOT _runtime_output_dir OR _runtime_output_dir STREQUAL "NOTFOUND") - if(CMAKE_CONFIGURATION_TYPES) - set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}/${_config}") - else() - set(_runtime_output_dir "${CMAKE_CURRENT_BINARY_DIR}") - endif() - endif() - - string(REPLACE "$" "${_config}" _runtime_output_dir "${_runtime_output_dir}") - if(_runtime_output_dir MATCHES "\\$<") - message(FATAL_ERROR "Nabla: nbl_configure_cuda_interop_runtime supports only plain runtime output directories or $.") - endif() - - cmake_path(IS_ABSOLUTE _runtime_output_dir _is_abs) - if(NOT _is_abs) - cmake_path(ABSOLUTE_PATH _runtime_output_dir BASE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" OUTPUT_VARIABLE _runtime_output_dir) - endif() - cmake_path(NORMAL_PATH _runtime_output_dir OUTPUT_VARIABLE _runtime_output_dir) - - list(APPEND _runtime_jsons "${_runtime_output_dir}/nbl_cuda_interop_runtime.json") - endforeach() - - list(REMOVE_DUPLICATES _runtime_jsons) - set(${_OUT_FILES} ${_runtime_jsons} PARENT_SCOPE) -endfunction() - -function(nbl_configure_cuda_interop_runtime TARGET_NAME) - cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) - - if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Nabla: unexpected arguments for nbl_configure_cuda_interop_runtime: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") - endif() - - if(NOT TARGET "${TARGET_NAME}") - message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") - endif() - - _nbl_cuda_interop_collect_runtime_include_dirs(_include_dirs ${_NBL_CUDA_INTEROP_INCLUDE_DIRS}) - - _nbl_cuda_interop_make_runtime_paths_json(_runtime_paths_json_content ${_include_dirs}) - _nbl_cuda_interop_collect_target_runtime_jsons("${TARGET_NAME}" _runtime_paths_jsons "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") - - foreach(_runtime_paths_json IN LISTS _runtime_paths_jsons) - 
file(GENERATE OUTPUT "${_runtime_paths_json}" CONTENT "${_runtime_paths_json_content}" TARGET "${TARGET_NAME}") - endforeach() - - set_source_files_properties(${_runtime_paths_jsons} PROPERTIES GENERATED TRUE HEADER_FILE_ONLY TRUE) - target_sources("${TARGET_NAME}" PRIVATE ${_runtime_paths_jsons}) -endfunction() - -function(nbl_target_link_cuda_interop TARGET_NAME) - set(_args ${ARGN}) - set(_scope PRIVATE) - - if(_args) - list(GET _args 0 _first_arg) - if(_first_arg MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") - set(_scope "${_first_arg}") - list(REMOVE_AT _args 0) - endif() - endif() - - cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${_args}) - - if(_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS) - message(FATAL_ERROR "Nabla: unexpected arguments for nbl_target_link_cuda_interop: ${_NBL_CUDA_INTEROP_UNPARSED_ARGUMENTS}") - endif() - - if(NOT TARGET "${TARGET_NAME}") - message(FATAL_ERROR "Nabla: target \"${TARGET_NAME}\" does not exist") - endif() - if(NOT TARGET Nabla::ext::CUDAInterop) - message(FATAL_ERROR "Nabla: Nabla::ext::CUDAInterop is not available. 
Request the CUDAInterop package component or enable NBL_COMPILE_WITH_CUDA.") + string(CONFIGURE "${_runtime_json}" _runtime_json @ONLY) + set(_runtime_json_path "$/nbl_cuda_interop_runtime.json") + if(_NBL_CUDA_INTEROP_RUNTIME_JSON) + set(_runtime_json_path "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") endif() - - target_link_libraries("${TARGET_NAME}" ${_scope} Nabla::ext::CUDAInterop) - nbl_configure_cuda_interop_runtime("${TARGET_NAME}" - RUNTIME_JSON "${_NBL_CUDA_INTEROP_RUNTIME_JSON}" - INCLUDE_DIRS ${_NBL_CUDA_INTEROP_INCLUDE_DIRS} - ) + file(GENERATE OUTPUT "${_runtime_json_path}" CONTENT "${_runtime_json}" TARGET "${TARGET_NAME}") endfunction() diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index d60b15639a..2ce46cbc93 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -113,11 +113,15 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. -Nabla does not ship CUDA runtime headers by default. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. 
An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or native CUDA code. + +NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: - https://docs.nvidia.com/cuda/eula/#distribution - https://docs.nvidia.com/cuda/eula/#attachment-a +This means the Attachment A header groups below can be redistributed with applications under the EULA terms. It does not mean the full CUDA SDK can be redistributed. Applications that need NVRTC runtime compilation can decide whether to ship the allowed headers, depend on an official runtime/header package, or point discovery at an installed toolkit/header package. + Attachment A lists header groups relevant to NVRTC runtime compilation: - NVIDIA Runtime Compilation Library and Header: `nvrtc.h` @@ -144,3 +148,6 @@ The split follows the same boundary pattern used by mature GPU projects: default - OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 - Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 - Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 +- OpenMM keeps the CUDA platform boundary on OpenMM types/properties in `CudaPlatform.h`, while `CudaContext.h` is the CUDA-specific low-level header that 
includes CUDA SDK headers and exposes `CUmodule` / `CUfunction`: https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaPlatform.h#L48-L120 and https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaContext.h#L32-L52 +- GROMACS gates CUDA source handling behind `GMX_GPU_CUDA` in the library build and keeps CUDA runtime types in internal GPU utility headers: https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/CMakeLists.txt#L339-L367 and https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gpu_utils/gputraits.cuh#L44-L58 +- ONNX Runtime keeps the public C API provider-neutral and routes CUDA through provider-specific bridge/factory code: https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h#L1-L80 and https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/session/provider_bridge_ort.cc#L110-L150 From ffba3d48d4ac5fd7f26ed324c310f338328572af Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Thu, 7 May 2026 18:12:36 +0200 Subject: [PATCH 110/149] Update CUDA interop examples pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 3c57a88af9..7b5817a6d4 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 3c57a88af9eba722fcc6b5b5ba3d136ab3e166ca +Subproject commit 7b5817a6d45c62a70fbe617022b6026a83939ff5 From 745f1b9166c457f538c67587cc9965280338884d Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Fri, 8 May 2026 17:07:54 +0200 Subject: [PATCH 111/149] Use opaque CUDA interop boundary --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 114 ++++++++++++------ include/nbl/video/CCUDADevice.h | 13 ++ include/nbl/video/CCUDAExportableMemory.h | 2 + include/nbl/video/CCUDAHandler.h | 1 + include/nbl/video/CCUDAImportedMemory.h | 3 + include/nbl/video/CCUDAImportedSemaphore.h | 2 + include/nbl/video/CUDAInterop.h | 1 + 
include/nbl/video/CUDAInteropHandles.h | 40 ++++++ src/nbl/ext/CUDAInterop/README.md | 49 ++++---- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 22 ++-- src/nbl/video/CCUDADevice.cpp | 99 ++++++++------- src/nbl/video/CCUDAExportableMemory.cpp | 19 +-- src/nbl/video/CCUDAHandler.cpp | 42 ++++--- src/nbl/video/CCUDAImportedMemory.cpp | 39 ++++-- src/nbl/video/CCUDAImportedSemaphore.cpp | 18 +-- 16 files changed, 309 insertions(+), 157 deletions(-) create mode 100644 include/nbl/video/CUDAInteropHandles.h diff --git a/examples_tests b/examples_tests index 7b5817a6d4..2d415af102 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 7b5817a6d45c62a70fbe617022b6026a83939ff5 +Subproject commit 2d415af102ebf710ea2bb369b3f0eca5544652f7 diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 6d142c6b3f..495f3cabc0 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -13,6 +13,7 @@ #include "cuda.h" #include "nvrtc.h" +#include #if CUDA_VERSION < 13000 #error "Need CUDA 13.0 SDK or higher." #endif @@ -160,54 +161,91 @@ struct SPTXResult nvrtcResult result; }; -// Opt-in native CUDA API. The declarations below are implemented by the Nabla library. -// This header is intentionally the only public path that includes CUDA SDK types. 
-class NBL_API2 CCUDAHandlerAccessor +template +concept cuda_opaque_handle = + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + sizeof(Opaque)==sizeof(Native) && + alignof(Opaque)==alignof(Native); + +template +struct SOpaqueCUDAType; + +template<> struct SOpaqueCUDAType { using type = CUdevice; }; +template<> struct SOpaqueCUDAType { using type = CUcontext; }; +template<> struct SOpaqueCUDAType { using type = CUdeviceptr; }; +template<> struct SOpaqueCUDAType { using type = CUexternalMemory; }; +template<> struct SOpaqueCUDAType { using type = CUexternalSemaphore; }; + +template +struct SNativeHandle { - public: - static const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); - static const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); - static bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); - static bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); - static const core::vector& getAvailableDevices(const CCUDAHandler& handler); - static nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - static nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); - static nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); - static SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); - static SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr - ); + using cuda_t = typename SOpaqueCUDAType::type; + static_assert(cuda_opaque_handle); + + 
SNativeHandle() = default; + SNativeHandle(const SNativeHandle&) = default; + SNativeHandle(const cuda_t& native) { operator=(native); } + SNativeHandle(const Opaque& opaque) { operator=(opaque); } + + SNativeHandle& operator=(const SNativeHandle&) = default; + SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } + SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } + + operator cuda_t&() { return value; } + operator const cuda_t&() const { return value; } + operator Opaque&() { return reinterpret_cast(value); } + operator const Opaque&() const { return reinterpret_cast(value); } + + Opaque* opaque() { return &static_cast(*this); } + const Opaque* opaque() const { return &static_cast(*this); } + Opaque asOpaque() const { return static_cast(*this); } + + cuda_t value = {}; }; -class NBL_API2 CCUDADeviceAccessor +using SCUdevice = SNativeHandle; +using SCUcontext = SNativeHandle; +using SCUdeviceptr = SNativeHandle; +using SCUexternalMemory = SNativeHandle; +using SCUexternalSemaphore = SNativeHandle; + +inline bool isBuildCUDAVersionCompatible() { - public: - static CUdevice getInternalObject(const CCUDADevice& device); - static CUcontext getContext(const CCUDADevice& device); - static size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size); - static core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params); -}; + const auto buildVersion = CCUDAHandler::getBuildCUDAVersion(); + return buildVersion==0u || buildVersion==CUDA_VERSION; +} -class NBL_API2 CCUDAExportableMemoryAccessor +inline bool isDeviceLocal(CUmemLocationType location) { - public: - static CUdeviceptr getDeviceptr(const CCUDAExportableMemory& memory); -}; + return location==CU_MEM_LOCATION_TYPE_DEVICE; +} + +// Opt-in native CUDA declarations. Nabla owns the definitions. 
+NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); +NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); +NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); +NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); +NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); +NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +NBL_API2 SPTXResult compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr +); -class NBL_API2 CCUDAImportedMemoryAccessor +inline size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) { - public: - static CUexternalMemory getInternalObject(const CCUDAImportedMemory& memory); - static CUresult getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer); -}; + return device.roundToGranularity(static_cast(location),size); +} -class NBL_API2 CCUDAImportedSemaphoreAccessor +inline core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params) { - public: - static CUexternalSemaphore getInternalObject(const CCUDAImportedSemaphore& semaphore); -}; + return 
device.createExportableMemory({params.size,params.alignment,static_cast(params.location)}); +} } diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7c1d1f272b..d4eb711cd2 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -5,6 +5,7 @@ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" @@ -75,9 +76,21 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted } const CCUDAHandler* getHandler() const { return m_handler.get(); } + cuda_interop::SCUdevice getInternalObject() const; + cuda_interop::SCUcontext getContext() const; + + struct SExportableMemoryCreationParams + { + size_t size; + uint32_t alignment; + uint32_t locationType; + }; + + size_t roundToGranularity(uint32_t locationType, size_t size) const; bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } + core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 6d29739408..6243bd8c73 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -5,6 +5,7 @@ #define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include @@ -32,6 +33,7 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted ~CCUDAExportableMemory() override; + cuda_interop::SCUdeviceptr getDeviceptr() const; core::smart_refctd_ptr 
exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index bb2d12c637..2ce6541696 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -50,6 +50,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static uint32_t getBuildCUDAVersion(); inline core::SRange getSTDHeaders() { diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index ac41c110a2..454088b7ae 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -2,6 +2,7 @@ #define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include @@ -20,6 +21,8 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: ~CCUDAImportedMemory() override; + cuda_interop::SCUexternalMemory getInternalObject() const; + bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; private: friend class CCUDADevice; diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index c8bf77313e..5a4f28abde 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -5,6 +5,7 @@ #define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include @@ -23,6 +24,7 @@ class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: ~CCUDAImportedSemaphore() override; + cuda_interop::SCUexternalSemaphore getInternalObject() const; private: friend class CCUDADevice; diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h index 57e92ae647..efea886b96 100644 --- 
a/include/nbl/video/CUDAInterop.h +++ b/include/nbl/video/CUDAInterop.h @@ -4,6 +4,7 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#include "nbl/video/CUDAInteropHandles.h" #include "nbl/video/CCUDADevice.h" #include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAHandler.h" diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h new file mode 100644 index 0000000000..88401f6a1f --- /dev/null +++ b/include/nbl/video/CUDAInteropHandles.h @@ -0,0 +1,40 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ + +#include +#include + +namespace nbl::video::cuda_interop +{ + +struct alignas(alignof(int32_t)) SCUdevice +{ + uint8_t value[sizeof(int32_t)] = {}; +}; + +struct alignas(alignof(void*)) SCUcontext +{ + uint8_t value[sizeof(void*)] = {}; +}; + +struct alignas(alignof(uintptr_t)) SCUdeviceptr +{ + uint8_t value[sizeof(uintptr_t)] = {}; +}; + +struct alignas(alignof(void*)) SCUexternalMemory +{ + uint8_t value[sizeof(void*)] = {}; +}; + +struct alignas(alignof(void*)) SCUexternalSemaphore +{ + uint8_t value[sizeof(void*)] = {}; +}; + +} + +#endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 2ce46cbc93..fca288a98f 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -2,10 +2,11 @@ ## Layout -- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and its implementation in `src/nbl/video/CCUDA*.cpp`. -- Those headers do not include CUDA SDK headers. Consumers that only link `Nabla::Nabla` do not need `cuda.h`, `nvrtc.h`, or a CUDA SDK install just to parse Nabla headers. 
-- `Nabla::ext::CUDAInterop` is an `INTERFACE` target for native CUDA opt-in. It builds no library. It only adds `CUDAInteropNative.h`, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only public opt-in header that includes CUDA SDK headers and exposes `cuda_native::*Accessor` classes for CUDA Driver API and NVRTC types. +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and the implementation in `src/nbl/video/CCUDA*.cpp`. +- The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. +- CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the native opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA native types with `cuda_native::SNativeHandle`. 
## CMake Usage @@ -54,33 +55,39 @@ This affects native opt-in compilation and generated runtime header discovery on auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); -auto memory = nbl::video::cuda_native::CCUDADeviceAccessor::createExportableMemory(*cudaDevice, { +if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) + return false; + +auto memory = nbl::video::cuda_native::createExportableMemory(*cudaDevice, { .size = size, .alignment = alignment, .location = CU_MEM_LOCATION_TYPE_DEVICE, }); +nbl::video::cuda_native::SCUdeviceptr mapped; +if (importedMemory) + importedMemory->getMappedBuffer(mapped.opaque()); + +CUdeviceptr rawMapped = mapped; +CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); + std::string log; -std::string cudaSource = loadKernelText(); -auto compile = nbl::video::cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( +auto compile = nbl::video::cuda_native::compileDirectlyToPTX( *handler, std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), - log, - 0, - nullptr, - nullptr + log ); ``` -Native access is not wrapped away. Opt-in code uses CUDA Driver API and NVRTC types directly through accessor classes: +Native access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: -- `CCUDAHandlerAccessor` exposes CUDA/NVRTC function tables, NVRTC program helpers, PTX compilation, native device enumeration, and default error handling. -- `CCUDADeviceAccessor` exposes `CUdevice`, `CUcontext`, memory granularity, and CUDA allocation creation. -- `CCUDAExportableMemoryAccessor`, `CCUDAImportedMemoryAccessor`, and `CCUDAImportedSemaphoreAccessor` expose the raw CUDA handles needed for interop. -- Accessor methods take explicit Nabla references. 
Callers dereference `smart_refctd_ptr` at the call site instead of going through pointer/smart-pointer convenience overloads. -- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. There is no optional output pointer in the public API. +- `cuda_native::getCUDAFunctionTable` and `cuda_native::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables. +- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA native handles such as `CUdeviceptr`. +- `cuda_native::createExportableMemory` and `cuda_native::roundToGranularity` keep CUDA enum usage in the opt-in header while Nabla stores only integer/opaque data in its public ABI. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. Native opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. +- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. Smoke examples: @@ -92,10 +99,10 @@ Smoke examples: - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA native handle storage. The native opt-in header validates this with `static_assert`s against the SDK used by the consumer. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. -- `CUDAInteropNative.h` declares exported accessor classes whose definitions still live in `Nabla.dll`. The opt-in header owns only the CUDA SDK surface. Nabla owns the implementation and ABI. 
-- Native opt-in ABI uses CUDA Driver API handles/enums such as `CUdevice`, `CUcontext`, `CUdeviceptr`, `CUexternalMemory`, and `CUexternalSemaphore`, plus small fixed-layout parameter/result structs. -- SDK-sized arrays and other layouts derived from CUDA SDK constants stay private to Nabla. A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. +- SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. +- A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. Leaky/native code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. @@ -109,7 +116,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. 
Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 5d35ec8bed..52fdb43539 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -23,7 +23,7 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cuda_native::CCUDADeviceAccessor::createExportableMemory(cudaDevice, { + auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { .size = 4096, .alignment = 4096, .location = CU_MEM_LOCATION_TYPE_DEVICE, @@ -35,18 +35,20 @@ using namespace nbl::video; auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); - CUdeviceptr mappedVulkanMemory = 0; + cuda_native::SCUdeviceptr mappedVulkanMemory; if (importedFromVulkan) - cuda_native::CCUDAImportedMemoryAccessor::getMappedBuffer(*importedFromVulkan,&mappedVulkanMemory); + importedFromVulkan->getMappedBuffer(mappedVulkanMemory.opaque()); - const CUdeviceptr cudaDevicePtr = cuda_native::CCUDAExportableMemoryAccessor::getDeviceptr(*cudaMemory); - const CUexternalSemaphore cudaSemaphore = importedSemaphore ? 
cuda_native::CCUDAImportedSemaphoreAccessor::getInternalObject(*importedSemaphore):nullptr; + const CUdeviceptr cudaDevicePtr = cuda_native::SCUdeviceptr(cudaMemory->getDeviceptr()); + CUexternalSemaphore cudaSemaphore = nullptr; + if (importedSemaphore) + cudaSemaphore = cuda_native::SCUexternalSemaphore(importedSemaphore->getInternalObject()); return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(handler); + auto& cuda = cuda_native::getCUDAFunctionTable(handler); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -98,7 +100,7 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto compile = cuda_native::CCUDAHandlerAccessor::compileDirectlyToPTX( + auto compile = cuda_native::compileDirectlyToPTX( handler, std::string(Source), "cuda_fp16_discovery_probe.cu", @@ -124,7 +126,9 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!isAPILoaded()) return false; - static_assert(std::is_same_v())), CUdevice>); + static_assert(std::is_same_v); + if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) + return false; #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); @@ -147,7 +151,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!cudaFp16HeaderCompileProbe(*handler)) return false; - const auto& devices = nbl::video::cuda_native::CCUDAHandlerAccessor::getAvailableDevices(*handler); + const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler); if (devices.empty()) return true; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 359cd093a1..2ed02a6282 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -34,11 +34,11 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); + const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) assert(false); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) assert(false); for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) @@ -58,32 +58,31 @@ CCUDADevice::CCUDADevice( .win32HandleMetaData = &metadata, #endif }; - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) assert(false); } } -namespace cuda_native +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - -CUdevice CCUDADeviceAccessor::getInternalObject(const CCUDADevice& device) -{ - return SAccess::native(device).handle; + return cuda_native::SCUdevice(cuda_native::SAccess::native(*this).handle).asOpaque(); } -CUcontext CCUDADeviceAccessor::getContext(const CCUDADevice& device) +cuda_interop::SCUcontext CCUDADevice::getContext() const { - return SAccess::native(device).context; + return 
cuda_native::SCUcontext(cuda_native::SAccess::native(*this).context).asOpaque(); } -size_t CCUDADeviceAccessor::roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) +size_t CCUDADevice::roundToGranularity(uint32_t locationType, size_t size) const { - const auto& granularity = SAccess::native(device).allocationGranularity[location]; + if (locationType>=m_native->allocationGranularity.size()) + return 0u; + const auto& granularity = m_native->allocationGranularity[locationType]; + if (granularity==0u) + return 0u; return ((size - 1) / granularity + 1) * granularity; } -} - static bool isDeviceLocal(CUmemLocationType location) { return location==CU_MEM_LOCATION_TYPE_DEVICE; @@ -93,7 +92,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept { const auto handler = device.getHandler(); const auto& native = cuda_native::SAccess::native(device); - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); + const auto& cu = cuda_native::getCUDAFunctionTable(*handler); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -101,7 +100,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) assert(false); return err; } @@ -113,9 +112,9 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) assert(false); - if 
(!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) assert(false); return err; } @@ -125,23 +124,23 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept return CUDA_SUCCESS; } -namespace cuda_native -{ - -core::smart_refctd_ptr CCUDADeviceAccessor::createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& inParams) +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&& inParams) { - const auto handler = device.getHandler(); - auto& native = SAccess::native(device); - auto logger = SAccess::logger(device); + const auto handler = getHandler(); + auto& native = cuda_native::SAccess::native(*this); + auto logger = cuda_native::SAccess::logger(*this); + const auto location = static_cast(inParams.locationType); CCUDAExportableMemory::SCachedCreationParams params = { .size = inParams.size, .alignment = inParams.alignment, - .granularSize = CCUDADeviceAccessor::roundToGranularity(device, inParams.location, inParams.size), - .deviceLocal = isDeviceLocal(inParams.location) + .granularSize = roundToGranularity(inParams.locationType, inParams.size), + .deviceLocal = isDeviceLocal(location) }; + if (params.granularSize==0u) + return nullptr; - auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*handler); + auto& cu = cuda_native::getCUDAFunctionTable(*handler); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -151,14 +150,14 @@ core::smart_refctd_ptr CCUDADeviceAccessor::createExporta const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = SAccess::allocationHandleType(), - .location = { .type = inParams.location, .id = native.handle }, + .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), + .location = { .type = location, .id = native.handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, 
#endif }; - auto nativeState = SAccess::makeExportableMemoryNativeState(); + auto nativeState = cuda_native::SAccess::makeExportableMemoryNativeState(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) @@ -170,16 +169,16 @@ core::smart_refctd_ptr CCUDADeviceAccessor::createExporta if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) assert(false); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(device,&SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, inParams.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*this,&cuda_native::SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) { logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - if (!CCUDAHandlerAccessor::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) + if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) assert(false); bool closeSucceed = CloseExternalHandle(params.externalHandle); @@ -195,14 +194,12 @@ core::smart_refctd_ptr CCUDADeviceAccessor::createExporta return nullptr; } - return SAccess::makeExportableMemory(core::smart_refctd_ptr(&device),std::move(params),std::move(nativeState)); -} - + return cuda_native::SAccess::makeExportableMemory(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); + const auto& cu = 
cuda_native::getCUDAFunctionTable(*m_handler); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -233,7 +230,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler); + auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -266,7 +263,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_handler, cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + if (!cuda_native::defaultHandleResult(*m_handler, cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) assert(false); } @@ -298,6 +295,26 @@ CCUDADevice::CCUDADevice( CCUDADevice::~CCUDADevice() = default; +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const +{ + return {}; +} + +cuda_interop::SCUcontext CCUDADevice::getContext() const +{ + return {}; +} + +size_t CCUDADevice::roundToGranularity(uint32_t, size_t) const +{ + return 0u; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&&) +{ + return nullptr; +} + core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index f84169e38f..6c77736628 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -52,12 +52,12 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); 
+ const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) assert(false); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) assert(false); bool closeSucceed = CloseExternalHandle(m_params.externalHandle); @@ -65,15 +65,11 @@ CCUDAExportableMemory::~CCUDAExportableMemory() } -namespace cuda_native +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - -CUdeviceptr CCUDAExportableMemoryAccessor::getDeviceptr(const CCUDAExportableMemory& memory) -{ - return SAccess::native(memory).ptr; + return cuda_native::SCUdeviceptr(m_native->ptr).asOpaque(); } -} } #else @@ -102,6 +98,11 @@ core::smart_refctd_ptr CCUDAExportableMemory::create(core CCUDAExportableMemory::~CCUDAExportableMemory() = default; +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const +{ + return {}; +} + core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const { return nullptr; diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 78434d9bd5..4168612e61 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -340,10 +340,15 @@ CCUDAHandler::CCUDAHandler( CCUDAHandler::~CCUDAHandler() = default; +uint32_t CCUDAHandler::getBuildCUDAVersion() +{ + return CUDA_VERSION; +} + namespace cuda_native { -bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& 
logger) { switch (result) { @@ -709,12 +714,12 @@ bool CCUDAHandlerAccessor::defaultHandleResult(CUresult result, const system::lo return false; } -bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, CUresult result) +bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) { - return CCUDAHandlerAccessor::defaultHandleResult(result,SAccess::logger(handler)); + return defaultHandleResult(result,SAccess::logger(handler)); } -bool CCUDAHandlerAccessor::defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) +bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { switch (result) { @@ -862,22 +867,22 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste namespace cuda_native { -const CUDA& CCUDAHandlerAccessor::getCUDAFunctionTable(const CCUDAHandler& handler) +const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).cuda; } -const NVRTC& CCUDAHandlerAccessor::getNVRTCFunctionTable(const CCUDAHandler& handler) +const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) { return SAccess::native(handler).nvrtc; } -const core::vector& CCUDAHandlerAccessor::getAvailableDevices(const CCUDAHandler& handler) +const core::vector& getAvailableDevices(const CCUDAHandler& handler) { return SAccess::native(handler).availableDevices; } -nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -889,12 +894,12 @@ nvrtcResult CCUDAHandlerAccessor::createProgram(CCUDAHandler& handler, nvrtcProg 
return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); } -nvrtcResult CCUDAHandlerAccessor::compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) { return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); } -nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) +nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); @@ -907,7 +912,7 @@ nvrtcResult CCUDAHandlerAccessor::getProgramLog(const CCUDAHandler& handler, nvr return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); } -SPTXResult CCUDAHandlerAccessor::getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); @@ -945,15 +950,15 @@ static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult r const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); - result = CCUDAHandlerAccessor::compileProgram(handler,program,{optionsBegin,optionsEnd}); - CCUDAHandlerAccessor::getProgramLog(handler,program,log); + result = compileProgram(handler,program,{optionsBegin,optionsEnd}); + getProgramLog(handler,program,log); if (result!=NVRTC_SUCCESS) return {nullptr,result}; - return CCUDAHandlerAccessor::getPTX(handler,program); + return getPTX(handler,program); } -SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( +SPTXResult compileDirectlyToPTX( CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { @@ -965,7 +970,7 @@ SPTXResult CCUDAHandlerAccessor::compileDirectlyToPTX( SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); }); - result = CCUDAHandlerAccessor::createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); + result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); } @@ -1095,6 +1100,11 @@ CCUDAHandler::CCUDAHandler( CCUDAHandler::~CCUDAHandler() = default; +uint32_t CCUDAHandler::getBuildCUDAVersion() +{ + return 0u; +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 9145fe18ac..404323a365 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -18,31 +18,34 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev assert(m_native); } -namespace cuda_native +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - -CUexternalMemory CCUDAImportedMemoryAccessor::getInternalObject(const CCUDAImportedMemory& memory) -{ - 
return SAccess::native(memory).handle; + return cuda_native::SCUexternalMemory(m_native->handle).asOpaque(); } -CUresult CCUDAImportedMemoryAccessor::getMappedBuffer(const CCUDAImportedMemory& memory, CUdeviceptr* mappedBuffer) +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const { + if (!mappedBuffer) + return false; + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; bufferDesc.offset = 0; - bufferDesc.size = SAccess::source(memory)->getAllocationSize(); + bufferDesc.size = m_src->getAllocationSize(); - const auto& cu = CCUDAHandlerAccessor::getCUDAFunctionTable(*SAccess::device(memory)->getHandler()); - return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, SAccess::native(memory).handle, &bufferDesc); + CUdeviceptr nativeMappedBuffer = 0; + const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) + return false; -} - + *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer).asOpaque(); + return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) assert(false); } @@ -66,6 +69,16 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev CCUDAImportedMemory::~CCUDAImportedMemory() = default; +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const +{ + return {}; +} + +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr*) const +{ + return false; +} + } 
#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 5d7d3e07ae..4fce78788c 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -17,20 +17,15 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrhandle).asOpaque(); } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = cuda_native::CCUDAHandlerAccessor::getCUDAFunctionTable(*m_device->getHandler()); - if (!cuda_native::CCUDAHandlerAccessor::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) + auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) assert(false); } } @@ -53,6 +48,11 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr Date: Sat, 9 May 2026 13:22:30 +0200 Subject: [PATCH 112/149] Clean CUDA interop boundary --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 84 ++++++++++++------- include/nbl/video/CCUDADevice.h | 29 +------ include/nbl/video/CCUDAHandler.h | 4 + include/nbl/video/CUDAInteropHandles.h | 37 ++++---- src/nbl/ext/CUDAInterop/README.md | 48 ++++++----- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 16 ++-- src/nbl/video/CCUDADevice.cpp | 60 +++++++++++-- src/nbl/video/CCUDAExportableMemory.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 35 ++++---- src/nbl/video/CCUDAImportedMemory.cpp | 4 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 2 +- 12 files changed, 188 insertions(+), 135 deletions(-) diff --git a/examples_tests b/examples_tests index 2d415af102..e289ee14f5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 2d415af102ebf710ea2bb369b3f0eca5544652f7 +Subproject commit e289ee14f5b8f05004726e6f03c81a9a2e768219 diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h 
b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 495f3cabc0..77d248dee2 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -1,16 +1,45 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h +/* + CUDA SDK opt-in boundary for Nabla CUDA interop. + + Public nbl/video CUDA interop headers expose SDK-free cuda_interop::SCU* opaque handles. This header is the + explicit boundary where a consumer accepts CUDA/NVRTC SDK headers, raw CU* types, and Nabla helper APIs whose + signatures use CUDA SDK types. This happens by linking Nabla::ext::CUDAInterop and including this file, which + includes cuda.h and nvrtc.h. The CUDA SDK becomes a compile-time requirement only for that SDK opt-in + consumer. + + The exported definitions stay in Nabla because they are glue between the Nabla world and the CUDA world: + dynamic Driver API/NVRTC loader access, NVRTC program helpers, error handling, runtime header discovery, and + CUDA/Vulkan resource interop lifetime. This header only exposes the CUDA-typed signatures for that glue after + the consumer explicitly opts in. Nabla::ext::CUDAInterop is the build-system edge for this SDK-typed surface. + It is not a separate owner of these definitions. Code that only consumes Nabla::Nabla does not need CUDA SDK + headers and does not parse CUDA/NVRTC declarations. + + Keeping SDK-defined types out of Nabla's public ABI is intentional. CUDA headers have changed observable + compile-time constants across SDK versions: + - CUDA Toolkit 9.0 documented CU_CTX_FLAGS_MASK as 0x1f. CUDA 12.1, 12.5, and 13.2 define it as 0xff. + - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS as 93. CUDA 12.1, 12.5, + and 13.2 keep 93 as CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 and define the unsuffixed name + as 122. 
+ - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR as 94. CUDA 12.1, 12.5, + and 13.2 keep 94 as CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 and define the unsuffixed name + as 123. + + If these SDK declarations leak through public Nabla headers, consumers can silently compile against a + different CUDA interpretation than the one used to build the interop implementation. That is especially + problematic for installed packages, plugins, and separately built downstream projects. The opaque handles + keep Nabla's public ABI independent from CUDA SDK headers. This opt-in header then validates handle + size/alignment against the SDK selected by the SDK opt-in consumer. +*/ #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ - +#include #include "nbl/video/CUDAInterop.h" - #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" -#include - #include "cuda.h" #include "nvrtc.h" #include @@ -148,13 +177,6 @@ struct SCUDADeviceInfo CUuuid uuid = {}; }; -struct SExportableMemoryCreationParams -{ - size_t size; - uint32_t alignment; - CUmemLocationType location; -}; - struct SPTXResult { core::smart_refctd_ptr ptx; @@ -167,7 +189,14 @@ concept cuda_opaque_handle = std::is_trivially_copyable_v && sizeof(Opaque)==sizeof(Native) && alignof(Opaque)==alignof(Native); +/* + Map Nabla opaque handles to CUDA SDK handle types. + This is deliberately small. It is not an attempt to wrap CUDA. It only gives SDK opt-in code a convenient + way to pass Nabla-owned opaque handles to CUDA C APIs while checking that the public opaque type has the same + layout as the CUDA type visible in this translation unit. If a future SDK changes one of these handle layouts, + the SDK opt-in build fails here instead of letting ABI drift propagate through packaged Nabla headers. 
+*/ template struct SOpaqueCUDAType; @@ -176,7 +205,13 @@ template<> struct SOpaqueCUDAType { using type = CUcon template<> struct SOpaqueCUDAType { using type = CUdeviceptr; }; template<> struct SOpaqueCUDAType { using type = CUexternalMemory; }; template<> struct SOpaqueCUDAType { using type = CUexternalSemaphore; }; +/* + CUDA SDK view of an SDK-free opaque handle. + The conversions are intentionally available only after including this header. Public Nabla headers expose + only the opaque SCU* values. Once a consumer opts in, SNativeHandle restores the CUDA spelling and ergonomics + for raw Driver API calls without adding accessors to every interop operation. +*/ template struct SNativeHandle { @@ -215,19 +250,18 @@ inline bool isBuildCUDAVersionCompatible() const auto buildVersion = CCUDAHandler::getBuildCUDAVersion(); return buildVersion==0u || buildVersion==CUDA_VERSION; } +/* + Nabla interop API declarations with CUDA SDK signatures. -inline bool isDeviceLocal(CUmemLocationType location) -{ - return location==CU_MEM_LOCATION_TYPE_DEVICE; -} - -// Opt-in native CUDA declarations. Nabla owns the definitions. -NBL_API2 const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler); -NBL_API2 const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler); + These declarations belong to the Nabla interop API. They live behind Nabla::ext::CUDAInterop because their + signatures mention CUDA/NVRTC SDK types directly. Keeping them out of nbl/video/CCUDA*.h means Nabla's public + API can be parsed and packaged without CUDA SDK headers. Nabla still owns the exported glue definitions. + Consumers accept this SDK-typed API surface only by including this header and linking the explicit interop + target. 
+*/ NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -NBL_API2 const core::vector& getAvailableDevices(const CCUDAHandler& handler); NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -237,16 +271,6 @@ NBL_API2 SPTXResult compileDirectlyToPTX( std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); -inline size_t roundToGranularity(const CCUDADevice& device, CUmemLocationType location, size_t size) -{ - return device.roundToGranularity(static_cast(location),size); -} - -inline core::smart_refctd_ptr createExportableMemory(CCUDADevice& device, SExportableMemoryCreationParams&& params) -{ - return device.createExportableMemory({params.size,params.alignment,static_cast(params.location)}); -} - } #endif diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index d4eb711cd2..0f8bd015ed 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -10,7 +10,6 @@ #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" -#include #include #include @@ -50,32 +49,13 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted EVA_80, EVA_COUNT }; - static inline constexpr const char* virtualArchCompileOption[] = { - "-arch=compute_30", - "-arch=compute_32", - "-arch=compute_35", - "-arch=compute_37", - "-arch=compute_50", - 
"-arch=compute_52", - "-arch=compute_53", - "-arch=compute_60", - "-arch=compute_61", - "-arch=compute_62", - "-arch=compute_70", - "-arch=compute_72", - "-arch=compute_75", - "-arch=compute_80" - }; - inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} + E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() const; ~CCUDADevice() override; - inline core::SRange geDefaultCompileOptions() const - { - return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; - } + core::SRange geDefaultCompileOptions() const; - const CCUDAHandler* getHandler() const { return m_handler.get(); } + const CCUDAHandler* getHandler() const; cuda_interop::SCUdevice getInternalObject() const; cuda_interop::SCUcontext getContext() const; @@ -88,8 +68,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(uint32_t locationType, size_t size) const; - bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); @@ -105,7 +83,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 2ce6541696..db30b08587 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -23,6 +23,8 @@ class IPhysicalDevice; namespace cuda_native { +class CUDA; +class NVRTC; struct SAccess; } @@ -51,6 +53,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted public: static 
core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); static uint32_t getBuildCUDAVersion(); + const cuda_native::CUDA& getCUDAFunctionTable() const; + const cuda_native::NVRTC& getNVRTCFunctionTable() const; inline core::SRange getSTDHeaders() { diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 88401f6a1f..741a04c319 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -10,30 +10,25 @@ namespace nbl::video::cuda_interop { -struct alignas(alignof(int32_t)) SCUdevice +/* + SDK-free CUDA handle surrogates used by Nabla's public video API. + + These types are the small glue layer between Nabla and SDK-typed CUDA interop code. They let nbl/video/CCUDA*.h + expose CUDA-related objects without including cuda.h or nvrtc.h, so consumers that only link Nabla::Nabla do + not inherit CUDA SDK as a public compile-time dependency. CUDAInteropNative.h maps these opaque handles back + to the real CU* types and checks their size/alignment against the SDK selected by the opt-in consumer. 
+*/ +template +struct alignas(alignof(Storage)) SOpaqueCUDAHandle { - uint8_t value[sizeof(int32_t)] = {}; + uint8_t value[sizeof(Storage)] = {}; }; -struct alignas(alignof(void*)) SCUcontext -{ - uint8_t value[sizeof(void*)] = {}; -}; - -struct alignas(alignof(uintptr_t)) SCUdeviceptr -{ - uint8_t value[sizeof(uintptr_t)] = {}; -}; - -struct alignas(alignof(void*)) SCUexternalMemory -{ - uint8_t value[sizeof(void*)] = {}; -}; - -struct alignas(alignof(void*)) SCUexternalSemaphore -{ - uint8_t value[sizeof(void*)] = {}; -}; +struct SCUdevice : SOpaqueCUDAHandle {}; +struct SCUcontext : SOpaqueCUDAHandle {}; +struct SCUdeviceptr : SOpaqueCUDAHandle {}; +struct SCUexternalMemory : SOpaqueCUDAHandle {}; +struct SCUexternalSemaphore : SOpaqueCUDAHandle {}; } diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index fca288a98f..a55bafbb9f 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -5,19 +5,19 @@ - `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and the implementation in `src/nbl/video/CCUDA*.cpp`. - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. -- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the native opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA native types with `cuda_native::SNativeHandle`. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. 
 It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA SDK types with `cuda_native::SNativeHandle`. ## CMake Usage -Default Nabla usage stays SDK-free: +`Nabla::Nabla`-only usage stays SDK-free: ```cmake find_package(Nabla CONFIG REQUIRED) target_link_libraries(app PRIVATE Nabla::Nabla) ``` -Native CUDA interop is explicit: +SDK-typed CUDA interop is explicit: ```cmake find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) @@ -39,15 +39,15 @@ nbl_target_link_cuda_interop(native_app PRIVATE ) ``` -Consumers can also choose the SDK used for native compilation with: +Consumers can also choose the SDK used for SDK-typed compilation with: ```cmake cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT=<path-to-cuda-toolkit> ``` -This affects native opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. 
-## Native Usage +## SDK Opt-In Usage ```cpp #include "nbl/ext/CUDAInterop/CUDAInteropNative.h" @@ -58,10 +58,10 @@ auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDev if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) return false; -auto memory = nbl::video::cuda_native::createExportableMemory(*cudaDevice, { +auto memory = cudaDevice->createExportableMemory({ .size = size, .alignment = alignment, - .location = CU_MEM_LOCATION_TYPE_DEVICE, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, }); nbl::video::cuda_native::SCUdeviceptr mapped; @@ -70,6 +70,8 @@ if (importedMemory) CUdeviceptr rawMapped = mapped; CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); +auto& cu = handler->getCUDAFunctionTable(); +auto& nvrtc = handler->getNVRTCFunctionTable(); std::string log; auto compile = nbl::video::cuda_native::compileDirectlyToPTX( @@ -81,46 +83,46 @@ auto compile = nbl::video::cuda_native::compileDirectlyToPTX( ); ``` -Native access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: +SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: -- `cuda_native::getCUDAFunctionTable` and `cuda_native::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables. -- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA native handles such as `CUdeviceptr`. -- `cuda_native::createExportableMemory` and `cuda_native::roundToGranularity` keep CUDA enum usage in the opt-in header while Nabla stores only integer/opaque data in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. Native opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. 
+- `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. +- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. +- CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. - `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. Smoke examples: - `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. -- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks default package usage without native opt-in. -- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks native opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. +- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks `Nabla::Nabla` package usage without SDK opt-in. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. ## ABI - `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. -- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA native handle storage. The native opt-in header validates this with `static_assert`s against the SDK used by the consumer. 
+- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. -- A consumer can build native opt-in code with its own compatible SDK independently from the SDK used to build Nabla. Leaky/native code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. +- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. ## Runtime Header Discovery -NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a default `Nabla::Nabla` package requirement. +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a `Nabla::Nabla` package requirement. -- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into native CUDA interop. +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into SDK-typed CUDA interop. - The JSON is a build artifact. 
Nabla packages do not install host-specific CUDA paths. - Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Default discovery is cached after the first call. +- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Discovery is cached after the first call. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. -Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or native CUDA code. +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. 
Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or SDK-typed CUDA code. NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." See: @@ -149,7 +151,7 @@ CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On ## Related Designs -The split follows the same boundary pattern used by mature GPU projects: default headers avoid vendor SDK requirements, native access is explicit, and implementation details stay outside the default public API. +The split follows the same boundary pattern used by mature GPU projects: public/common headers avoid vendor SDK requirements, vendor SDK access is explicit, and implementation details stay outside the public API. 
- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 - OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 52fdb43539..f760b78a1c 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -23,10 +23,10 @@ using namespace nbl::video; core::smart_refctd_ptr vulkanMemory, core::smart_refctd_ptr vulkanSemaphore) { - auto cudaMemory = cuda_native::createExportableMemory(cudaDevice, { + auto cudaMemory = cudaDevice.createExportableMemory({ .size = 4096, .alignment = 4096, - .location = CU_MEM_LOCATION_TYPE_DEVICE, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, }); if (!cudaMemory) return false; @@ -48,7 +48,7 @@ using namespace nbl::video; bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) { - auto& cuda = cuda_native::getCUDAFunctionTable(handler); + auto& cuda = handler.getCUDAFunctionTable(); CUcontext context = nullptr; if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) @@ -151,11 +151,15 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!cudaFp16HeaderCompileProbe(*handler)) return false; - const auto& devices = nbl::video::cuda_native::getAvailableDevices(*handler); - if (devices.empty()) + int deviceCount = 0; + if (handler->getCUDAFunctionTable().pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount==0) return true; - return cudaDriverRoundtrip(*handler, devices.front().handle); + 
CUdevice device = {}; + if (handler->getCUDAFunctionTable().pcuDeviceGet(&device,0)!=CUDA_SUCCESS) + return false; + + return cudaDriverRoundtrip(*handler, device); } void workLoopBody() override {} diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 2ed02a6282..426b900f4d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,6 +3,26 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CUDAInterop.h" +namespace nbl::video +{ + +CCUDADevice::E_VIRTUAL_ARCHITECTURE CCUDADevice::getVirtualArchitecture() const +{ + return m_virtualArchitecture; +} + +core::SRange CCUDADevice::geDefaultCompileOptions() const +{ + return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; +} + +const CCUDAHandler* CCUDADevice::getHandler() const +{ + return m_handler.get(); +} + +} + #ifdef _NBL_COMPILE_WITH_CUDA_ #include "CUDAInteropNativeState.hpp" @@ -13,6 +33,30 @@ namespace nbl::video { +namespace +{ + +constexpr const char* VirtualArchCompileOption[] = { + "-arch=compute_30", + "-arch=compute_32", + "-arch=compute_35", + "-arch=compute_37", + "-arch=compute_50", + "-arch=compute_52", + "-arch=compute_53", + "-arch=compute_60", + "-arch=compute_61", + "-arch=compute_62", + "-arch=compute_70", + "-arch=compute_72", + "-arch=compute_75", + "-arch=compute_80" +}; + +static_assert(sizeof(VirtualArchCompileOption)/sizeof(*VirtualArchCompileOption)==CCUDADevice::EVA_COUNT); + +} + CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, @@ -22,7 +66,6 @@ CCUDADevice::CCUDADevice( m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), - m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), m_handler(std::move(handler)), m_native(std::move(nativeState)) @@ -30,11 +73,11 @@ 
CCUDADevice::CCUDADevice( assert(m_native); m_defaultCompileOptions.push_back("--std=c++14"); - m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); + m_defaultCompileOptions.push_back(VirtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = m_handler->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) assert(false); @@ -92,7 +135,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept { const auto handler = device.getHandler(); const auto& native = cuda_native::SAccess::native(device); - const auto& cu = cuda_native::getCUDAFunctionTable(*handler); + const auto& cu = handler->getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -140,7 +183,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (params.granularSize==0u) return nullptr; - auto& cu = cuda_native::getCUDAFunctionTable(*handler); + auto& cu = handler->getCUDAFunctionTable(); #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { @@ -199,7 +242,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + const auto& cu = m_handler->getCUDAFunctionTable(); const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -230,7 +273,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - auto& cu = cuda_native::getCUDAFunctionTable(*m_handler); + auto& cu = m_handler->getCUDAFunctionTable(); auto 
handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) @@ -263,7 +306,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - if (!cuda_native::defaultHandleResult(*m_handler, cuda_native::getCUDAFunctionTable(*m_handler).pcuCtxDestroy_v2(m_native->context))) + if (!cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context))) assert(false); } @@ -285,7 +328,6 @@ CCUDADevice::CCUDADevice( core::smart_refctd_ptr&& handler) : m_logger(nullptr) , m_vulkanConnection(std::move(vulkanConnection)) - , m_physicalDevice(vulkanDevice) , m_virtualArchitecture(virtualArchitecture) , m_handler(std::move(handler)) , m_native(std::move(nativeState)) diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 6c77736628..6458fe5af3 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -52,7 +52,7 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) assert(false); diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 4168612e61..9e40942914 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -345,6 +345,16 @@ uint32_t CCUDAHandler::getBuildCUDAVersion() return CUDA_VERSION; } +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + return cuda_native::SAccess::native(*this).cuda; +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + return cuda_native::SAccess::native(*this).nvrtc; +} + namespace cuda_native { @@ -867,21 +877,6 @@ core::smart_refctd_ptr 
CCUDAHandler::create(system::ISystem* syste namespace cuda_native { -const CUDA& getCUDAFunctionTable(const CCUDAHandler& handler) -{ - return SAccess::native(handler).cuda; -} - -const NVRTC& getNVRTCFunctionTable(const CCUDAHandler& handler) -{ - return SAccess::native(handler).nvrtc; -} - -const core::vector& getAvailableDevices(const CCUDAHandler& handler) -{ - return SAccess::native(handler).availableDevices; -} - nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) @@ -1105,6 +1100,16 @@ uint32_t CCUDAHandler::getBuildCUDAVersion() return 0u; } +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + std::abort(); +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + std::abort(); +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 404323a365..168fce511e 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -33,7 +33,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff bufferDesc.size = m_src->getAllocationSize(); CUdeviceptr nativeMappedBuffer = 0; - const auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) return false; @@ -44,7 +44,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + auto& cu = 
m_device->getHandler()->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) assert(false); } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 4fce78788c..3296d16a60 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -24,7 +24,7 @@ cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() c CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { - auto& cu = cuda_native::getCUDAFunctionTable(*m_device->getHandler()); + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) assert(false); } From fce838b8caaa587d5dcde0dae436ad1281221ef8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sat, 9 May 2026 14:20:06 +0200 Subject: [PATCH 113/149] Polish CUDA interop cleanup --- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 7 +++ include/nbl/video/CCUDADevice.h | 12 ++++- include/nbl/video/EApiType.h | 33 ++++++++++++- src/nbl/CMakeLists.txt | 1 - src/nbl/video/CCUDADevice.cpp | 48 ++++++------------- src/nbl/video/CCUDAExportableMemory.cpp | 10 ++-- src/nbl/video/CCUDAImportedMemory.cpp | 3 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 3 +- src/nbl/video/CUDAInteropNativeState.hpp | 1 - src/nbl/video/EApiType.cpp | 37 -------------- 10 files changed, 70 insertions(+), 85 deletions(-) delete mode 100644 src/nbl/video/EApiType.cpp diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 77d248dee2..2ec4b723c0 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -42,6 +42,7 @@ #include "cuda.h" #include "nvrtc.h" +#include #include #if CUDA_VERSION < 13000 #error "Need CUDA 13.0 SDK or higher." 
@@ -262,6 +263,12 @@ inline bool isBuildCUDAVersionCompatible() NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); +#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!::nbl::video::cuda_native::defaultHandleResult(*(handler), nblCudaInteropResult)) \ + assert(false); \ + } while(0) NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 0f8bd015ed..5acfd35831 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -10,6 +10,7 @@ #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" +#include #include #include @@ -66,7 +67,15 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted uint32_t locationType; }; - size_t roundToGranularity(uint32_t locationType, size_t size) const; + inline size_t roundToGranularity(uint32_t locationType, size_t size) const + { + if (locationType>=m_allocationGranularity.size()) + return 0u; + const auto granularity = m_allocationGranularity[locationType]; + if (size==0u || granularity==0u) + return 0u; + return ((size - 1) / granularity + 1) * granularity; + } core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); @@ -83,6 +92,7 @@ 
class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 44a31ecf90..9b1a79e4d4 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -3,6 +3,14 @@ #include "nbl/core/declarations.h" #include +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include +#else + #include +#endif namespace nbl::video { @@ -28,8 +36,29 @@ constexpr external_handle_t ExternalHandleNull = nullptr; constexpr external_handle_t ExternalHandleNull = -1; #endif -NBL_API2 bool CloseExternalHandle(external_handle_t handle); -NBL_API2 external_handle_t DuplicateExternalHandle(external_handle_t handle); +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle)==0; +#endif +} + +inline external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index d56c223e34..317cf3d2a1 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -257,7 +257,6 @@ set(NBL_VIDEO_SOURCES video/IGPUAccelerationStructure.cpp video/IGPUCommandBuffer.cpp video/IQueue.cpp - video/EApiType.cpp video/IGPUDescriptorSet.cpp video/IDeviceMemoryAllocation.cpp video/IDeviceMemoryBacked.cpp diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 426b900f4d..802e224793 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -84,7 +84,7 @@ CCUDADevice::CCUDADevice( if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) assert(false); - for (uint32_t locationType = 0; locationType < m_native->allocationGranularity.size(); ++locationType) + for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { #ifdef _WIN32 @@ -101,7 +101,7 @@ CCUDADevice::CCUDADevice( .win32HandleMetaData = &metadata, #endif }; - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_native->allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) assert(false); } } @@ -116,16 +116,6 @@ cuda_interop::SCUcontext CCUDADevice::getContext() const return cuda_native::SCUcontext(cuda_native::SAccess::native(*this).context).asOpaque(); } -size_t CCUDADevice::roundToGranularity(uint32_t locationType, size_t size) const -{ - if (locationType>=m_native->allocationGranularity.size()) - return 0u; - const auto& granularity = m_native->allocationGranularity[locationType]; - if (granularity==0u) - return 0u; - return ((size - 1) / granularity + 1) * granularity; -} - static bool isDeviceLocal(CUmemLocationType location) { return location==CU_MEM_LOCATION_TYPE_DEVICE; @@ -143,8 +133,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); return err; } @@ -155,10 +144,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept if (auto err = 
cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size))) - assert(false); - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)); + cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); return err; } @@ -212,8 +199,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); return nullptr; } @@ -221,19 +207,21 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor { logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - if (!cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem))) - assert(false); + cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); - bool closeSucceed = CloseExternalHandle(params.externalHandle); - assert(closeSucceed); + if (!CloseExternalHandle(params.externalHandle)) + logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - bool closeSucceed = CloseExternalHandle(params.externalHandle); - assert(closeSucceed); + cuda_native::defaultHandleResult(*handler, err); + cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); + cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); + if (!CloseExternalHandle(params.externalHandle)) + 
logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } @@ -306,8 +294,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - if (!cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context))) - assert(false); + cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); } } @@ -347,11 +334,6 @@ cuda_interop::SCUcontext CCUDADevice::getContext() const return {}; } -size_t CCUDADevice::roundToGranularity(uint32_t, size_t) const -{ - return 0u; -} - core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&&) { return nullptr; diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 6458fe5af3..2696fe1ebd 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -54,14 +54,12 @@ CCUDAExportableMemory::~CCUDAExportableMemory() { const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); - bool closeSucceed = CloseExternalHandle(m_params.externalHandle); - assert(closeSucceed); + if (!CloseExternalHandle(m_params.externalHandle)) + cuda_native::SAccess::logger(*m_device).log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp 
index 168fce511e..cff48931c0 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -45,8 +45,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff CCUDAImportedMemory::~CCUDAImportedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 3296d16a60..6a43fefc4c 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -25,8 +25,7 @@ cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() c CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle))) - assert(false); + cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle)); } } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 4be8178aa2..743bd10c3e 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -31,7 +31,6 @@ struct CCUDADevice::SNativeState { CUdevice handle = {}; CUcontext context = nullptr; - std::array allocationGranularity = {}; explicit SNativeState(CUdevice _handle) : handle(_handle) diff --git a/src/nbl/video/EApiType.cpp b/src/nbl/video/EApiType.cpp deleted file mode 100644 index d7eadd8b08..0000000000 --- a/src/nbl/video/EApiType.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include "nbl/video/EApiType.h" - -#ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include -#else -#include -#endif - -namespace 
nbl::video -{ - -bool CloseExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - return CloseHandle(handle); -#else - return close(handle)==0; -#endif -} - -external_handle_t DuplicateExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - HANDLE duplicated = ExternalHandleNull; - - const HANDLE process = GetCurrentProcess(); - if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; - - return duplicated; -#else - return dup(handle); -#endif -} - -} From 9f2d5feae119315e46db89fbf54e4500694cec56 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sat, 9 May 2026 16:57:14 +0200 Subject: [PATCH 114/149] Simplify CUDA interop native boundary --- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 100 +++++------------- include/nbl/system/DynamicFunctionCaller.h | 14 ++- include/nbl/video/CCUDADevice.h | 6 -- include/nbl/video/CCUDAExportableMemory.h | 7 +- include/nbl/video/CCUDAHandler.h | 6 +- include/nbl/video/CCUDAImportedMemory.h | 7 +- include/nbl/video/CCUDAImportedSemaphore.h | 6 -- include/nbl/video/CUDAInteropHandles.h | 42 ++++++++ src/nbl/ext/CUDAInterop/README.md | 28 +++-- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 10 +- src/nbl/video/CCUDADevice.cpp | 55 +++++----- src/nbl/video/CCUDAExportableMemory.cpp | 4 +- src/nbl/video/CCUDAHandler.cpp | 34 +++--- src/nbl/video/CCUDAImportedMemory.cpp | 4 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 2 +- src/nbl/video/CUDAInteropNativeState.hpp | 45 -------- 16 files changed, 169 insertions(+), 201 deletions(-) diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 2ec4b723c0..0e08fb2b97 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -39,21 +39,25 @@ #include "nbl/video/CUDAInterop.h" #include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" - #include "cuda.h" #include "nvrtc.h" 
-#include -#include -#if CUDA_VERSION < 13000 - #error "Need CUDA 13.0 SDK or higher." -#endif - namespace nbl::video::cuda_native { inline constexpr int MinimumCUDADriverVersion = 13000; inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; +static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); + +/* + The CUDA/NVRTC table classes below contain the calls used and tested by Nabla's interop implementation. + After including this SDK opt-in header, consumer code can load extra Driver API or NVRTC symbols from the + same loaded libraries without changing Nabla's ABI: + + auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); + The requested symbol must be declared by the CUDA SDK headers visible to this translation unit because the + helper uses decltype(cuNewCall) to preserve the native function signature. +*/ using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader @@ -184,73 +188,33 @@ struct SPTXResult nvrtcResult result; }; -template -concept cuda_opaque_handle = - std::is_trivially_copyable_v && - std::is_trivially_copyable_v && - sizeof(Opaque)==sizeof(Native) && - alignof(Opaque)==alignof(Native); -/* - Map Nabla opaque handles to CUDA SDK handle types. - - This is deliberately small. It is not an attempt to wrap CUDA. It only gives SDK opt-in code a convenient - way to pass Nabla-owned opaque handles to CUDA C APIs while checking that the public opaque type has the same - layout as the CUDA type visible in this translation unit. If a future SDK changes one of these handle layouts, - the SDK opt-in build fails here instead of letting ABI drift propagate through packaged Nabla headers. 
-*/ -template -struct SOpaqueCUDAType; - -template<> struct SOpaqueCUDAType { using type = CUdevice; }; -template<> struct SOpaqueCUDAType { using type = CUcontext; }; -template<> struct SOpaqueCUDAType { using type = CUdeviceptr; }; -template<> struct SOpaqueCUDAType { using type = CUexternalMemory; }; -template<> struct SOpaqueCUDAType { using type = CUexternalSemaphore; }; /* CUDA SDK view of an SDK-free opaque handle. The conversions are intentionally available only after including this header. Public Nabla headers expose - only the opaque SCU* values. Once a consumer opts in, SNativeHandle restores the CUDA spelling and ergonomics - for raw Driver API calls without adding accessors to every interop operation. + only the opaque SCU* values. Once a consumer opts in, the aliases below restore the CUDA spelling and + ergonomics for raw Driver API calls without adding accessors to every interop operation. Each alias maps one + Nabla opaque handle to the matching CUDA SDK handle and validates size/alignment against the SDK selected by + this opt-in translation unit. 
*/ -template -struct SNativeHandle -{ - using cuda_t = typename SOpaqueCUDAType::type; - static_assert(cuda_opaque_handle); - - SNativeHandle() = default; - SNativeHandle(const SNativeHandle&) = default; - SNativeHandle(const cuda_t& native) { operator=(native); } - SNativeHandle(const Opaque& opaque) { operator=(opaque); } - - SNativeHandle& operator=(const SNativeHandle&) = default; - SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } - SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } +using SCUdevice = cuda_interop::SNativeHandle; +using SCUcontext = cuda_interop::SNativeHandle; +using SCUdeviceptr = cuda_interop::SNativeHandle; +using SCUexternalMemory = cuda_interop::SNativeHandle; +using SCUexternalSemaphore = cuda_interop::SNativeHandle; - operator cuda_t&() { return value; } - operator const cuda_t&() const { return value; } - operator Opaque&() { return reinterpret_cast(value); } - operator const Opaque&() const { return reinterpret_cast(value); } - - Opaque* opaque() { return &static_cast(*this); } - const Opaque* opaque() const { return &static_cast(*this); } - Opaque asOpaque() const { return static_cast(*this); } - - cuda_t value = {}; -}; - -using SCUdevice = SNativeHandle; -using SCUcontext = SNativeHandle; -using SCUdeviceptr = SNativeHandle; -using SCUexternalMemory = SNativeHandle; -using SCUexternalSemaphore = SNativeHandle; - -inline bool isBuildCUDAVersionCompatible() +/* + Check whether this opt-in translation unit uses the exact CUDA SDK version that was used to build Nabla's + CUDA interop implementation. Opaque handle layout is checked by SNativeHandle aliases above. This exact + version check is a policy helper for SDK-typed code that wants to warn about or reject compatible-but-different + SDK headers. 
+*/ +inline bool isBuildCUDASDKVersionExactMatch() { - const auto buildVersion = CCUDAHandler::getBuildCUDAVersion(); + const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); return buildVersion==0u || buildVersion==CUDA_VERSION; } + /* Nabla interop API declarations with CUDA SDK signatures. @@ -263,12 +227,6 @@ inline bool isBuildCUDAVersionCompatible() NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ - do { \ - const auto nblCudaInteropResult = (expr); \ - if (!::nbl::video::cuda_native::defaultHandleResult(*(handler), nblCudaInteropResult)) \ - assert(false); \ - } while(0) NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); diff --git a/include/nbl/system/DynamicFunctionCaller.h b/include/nbl/system/DynamicFunctionCaller.h index cf99be32f0..d5642d3ea9 100644 --- a/include/nbl/system/DynamicFunctionCaller.h +++ b/include/nbl/system/DynamicFunctionCaller.h @@ -16,7 +16,7 @@ class DynamicFunctionCallerBase : public core::Unmovable { protected: static_assert(std::is_base_of::value, "Need a function pointer loader derived from `FuncPtrLoader`"); - FuncPtrLoaderT loader; + mutable FuncPtrLoaderT loader; public: //DynamicFunctionCallerBase() : loader() {} DynamicFunctionCallerBase(DynamicFunctionCallerBase&& other) : DynamicFunctionCallerBase() @@ -29,6 +29,16 @@ class DynamicFunctionCallerBase : public core::Unmovable 
} virtual ~DynamicFunctionCallerBase() = default; + inline bool isLibraryLoaded() const + { + return loader.isLibraryLoaded(); + } + + inline void* loadFuncPtr(const char* funcname) const + { + return loader.loadFuncPtr(funcname); + } + DynamicFunctionCallerBase& operator=(DynamicFunctionCallerBase&& other) { std::swap(loader, other.loader); @@ -41,6 +51,8 @@ class DynamicFunctionCallerBase : public core::Unmovable #define NBL_SYSTEM_IMPL_INIT_DYNLIB_FUNCPTR(FUNC_NAME) ,NBL_CONCATENATE(p , FUNC_NAME)(Base::loader.loadFuncPtr( #FUNC_NAME )) #define NBL_SYSTEM_IMPL_SWAP_DYNLIB_FUNCPTR(FUNC_NAME) std::swap(NBL_CONCATENATE(p, FUNC_NAME),other.NBL_CONCATENATE(p, FUNC_NAME)); +// Load an extra function from an already loaded dynamic library without adding it to the generated caller class. +#define NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(CALLER, FUNC_NAME) nbl::system::DynamicLibraryFunctionPointer((CALLER).loadFuncPtr(#FUNC_NAME)) #define NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS( CLASS_NAME, FUNC_PTR_LOADER_TYPE, ... 
) \ class CLASS_NAME : public nbl::system::DynamicFunctionCallerBase\ diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 5acfd35831..4658e51a10 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -18,11 +18,6 @@ namespace nbl::video { class CCUDAHandler; -namespace cuda_native -{ -struct SAccess; -} - class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: @@ -84,7 +79,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted private: friend class CCUDAHandler; - friend struct cuda_native::SAccess; struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 6243bd8c73..510f483b3b 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -14,11 +14,6 @@ namespace nbl::video { class CCUDADevice; -namespace cuda_native -{ -struct SAccess; -} - class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: @@ -37,7 +32,7 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: - friend struct cuda_native::SAccess; + friend class CCUDADevice; struct SNativeState; CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index db30b08587..a77ab66b68 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -25,7 +25,6 @@ namespace cuda_native { class CUDA; class NVRTC; -struct SAccess; } namespace cuda_interop @@ -52,9 +51,10 @@ class NBL_API2 
CCUDAHandler : public core::IReferenceCounted { public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); - static uint32_t getBuildCUDAVersion(); + static uint32_t getBuildCUDASDKVersion(); const cuda_native::CUDA& getCUDAFunctionTable() const; const cuda_native::NVRTC& getNVRTCFunctionTable() const; + inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } inline core::SRange getSTDHeaders() { @@ -80,8 +80,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ~CCUDAHandler() override; private: - friend struct cuda_native::SAccess; - struct SNativeState; CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 454088b7ae..e2c9bb6db6 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -12,21 +12,16 @@ namespace nbl::video class CCUDADevice; -namespace cuda_native -{ -struct SAccess; -} - class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { public: ~CCUDAImportedMemory() override; cuda_interop::SCUexternalMemory getInternalObject() const; bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; + bool getMappedBuffer(cuda_interop::SCUdeviceptr& mappedBuffer) const { return getMappedBuffer(&mappedBuffer); } private: friend class CCUDADevice; - friend struct cuda_native::SAccess; struct SNativeState; CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 5a4f28abde..7f2b266383 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -15,11 +15,6 @@ namespace nbl::video class CCUDADevice; -namespace cuda_native -{ -struct SAccess; -} - class 
NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: @@ -28,7 +23,6 @@ class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted private: friend class CCUDADevice; - friend struct cuda_native::SAccess; struct SNativeState; CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 741a04c319..92888d3ccf 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -6,6 +6,7 @@ #include #include +#include namespace nbl::video::cuda_interop { @@ -30,6 +31,47 @@ struct SCUdeviceptr : SOpaqueCUDAHandle {}; struct SCUexternalMemory : SOpaqueCUDAHandle {}; struct SCUexternalSemaphore : SOpaqueCUDAHandle {}; +template +concept cuda_opaque_handle = + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + sizeof(Opaque)==sizeof(Native) && + alignof(Opaque)==alignof(Native); + +/* + Native view of an SDK-free opaque handle. + + This template does not depend on CUDA SDK types by itself. CUDAInteropNative.h binds it to concrete CU* types + after the consumer opts into CUDA SDK headers. The layout check keeps the public opaque handle and the native + SDK handle compatible in that translation unit while preserving Nabla's SDK-free public headers. 
+*/ +template +struct SNativeHandle +{ + using cuda_t = Native; + static_assert(cuda_opaque_handle); + + SNativeHandle() = default; + SNativeHandle(const SNativeHandle&) = default; + SNativeHandle(const cuda_t& native) { operator=(native); } + SNativeHandle(const Opaque& opaque) { operator=(opaque); } + + SNativeHandle& operator=(const SNativeHandle&) = default; + SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } + SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } + + operator cuda_t&() { return value; } + operator const cuda_t&() const { return value; } + operator Opaque&() { return reinterpret_cast(value); } + operator const Opaque&() const { return reinterpret_cast(value); } + + Opaque* opaque() { return &static_cast(*this); } + const Opaque* opaque() const { return &static_cast(*this); } + Opaque asOpaque() const { return static_cast(*this); } + + cuda_t value = {}; +}; + } #endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index a55bafbb9f..231658e949 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,7 +6,7 @@ - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the only opt-in header that includes CUDA SDK headers. It maps Nabla opaque handles to CUDA SDK types with `cuda_native::SNativeHandle`. +- `CUDAInteropNative.h` is the opt-in SDK boundary. 
It includes CUDA SDK headers and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. ## CMake Usage @@ -55,8 +55,11 @@ This affects SDK opt-in compilation and generated runtime header discovery only. auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); -if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) - return false; +const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); +if (!exactBuildSDK) +{ + // Warn here, or return false if this application requires exact same-SDK policy. +} auto memory = cudaDevice->createExportableMemory({ .size = size, @@ -66,7 +69,7 @@ auto memory = cudaDevice->createExportableMemory({ nbl::video::cuda_native::SCUdeviceptr mapped; if (importedMemory) - importedMemory->getMappedBuffer(mapped.opaque()); + importedMemory->getMappedBuffer(mapped); CUdeviceptr rawMapped = mapped; CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); @@ -86,16 +89,25 @@ auto compile = nbl::video::cuda_native::compileDirectlyToPTX( SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: - `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. -- `cuda_native::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. +- The default tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. 
The symbol name must be declared by the CUDA SDK headers visible to that translation unit: + +```cpp +auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); +if (pcuNewCall) + pcuNewCall(...); +``` + +- `cuda_interop::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. The template itself is SDK-free. `CUDAInteropNative.h` only provides CUDA-typed aliases. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr::opaque()` and then use the wrapper as `CUdeviceptr`. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr` directly and then use it as `CUdeviceptr`. - `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. +- `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. Smoke examples: - `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. - `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks `Nabla::Nabla` package usage without SDK opt-in. -- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, and raw interop usage. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, extra dynamic symbol loading, and raw interop usage. 
## ABI @@ -104,7 +116,7 @@ Smoke examples: - Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. - CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. - SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. -- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDAVersionCompatible()` when exact CUDA SDK version matching is required. +- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDASDKVersionExactMatch()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index f760b78a1c..79e85555b7 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -37,7 +37,7 @@ using namespace nbl::video; cuda_native::SCUdeviceptr mappedVulkanMemory; if (importedFromVulkan) - importedFromVulkan->getMappedBuffer(mappedVulkanMemory.opaque()); + importedFromVulkan->getMappedBuffer(mappedVulkanMemory); const CUdeviceptr cudaDevicePtr = cuda_native::SCUdeviceptr(cudaMemory->getDeviceptr()); CUexternalSemaphore cudaSemaphore = nullptr; @@ -127,8 +127,7 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew return false; static_assert(std::is_same_v); - if (!nbl::video::cuda_native::isBuildCUDAVersionCompatible()) - return false; + [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); @@ -148,6 +147,11 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!handler) return true; + auto pcuDriverGetVersion = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuDriverGetVersion); + int loadedDriverVersion = 0; + if (!pcuDriverGetVersion || pcuDriverGetVersion(&loadedDriverVersion)!=CUDA_SUCCESS || loadedDriverVersion==0) + return false; + if (!cudaFp16HeaderCompileProbe(*handler)) return false; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 802e224793..1c73068a6d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -55,6 +55,15 @@ constexpr const char* VirtualArchCompileOption[] = { static_assert(sizeof(VirtualArchCompileOption)/sizeof(*VirtualArchCompileOption)==CCUDADevice::EVA_COUNT); +static CUmemAllocationHandleType 
getAllocationHandleType() +{ +#ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; +#else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif +} + } CCUDADevice::CCUDADevice( @@ -95,7 +104,7 @@ CCUDADevice::CCUDADevice( const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), + .requestedHandleTypes = getAllocationHandleType(), .location = { .type = static_cast(locationType), .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, @@ -108,12 +117,12 @@ CCUDADevice::CCUDADevice( cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return cuda_native::SCUdevice(cuda_native::SAccess::native(*this).handle).asOpaque(); + return cuda_native::SCUdevice(m_native->handle); } cuda_interop::SCUcontext CCUDADevice::getContext() const { - return cuda_native::SCUcontext(cuda_native::SAccess::native(*this).context).asOpaque(); + return cuda_native::SCUcontext(m_native->context); } static bool isDeviceLocal(CUmemLocationType location) @@ -121,11 +130,9 @@ static bool isDeviceLocal(CUmemLocationType location) return location==CU_MEM_LOCATION_TYPE_DEVICE; } -static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice nativeDevice, CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { - const auto handler = device.getHandler(); - const auto& native = cuda_native::SAccess::native(device); - const auto& cu = handler->getCUDAFunctionTable(); + const auto& cu = handler.getCUDAFunctionTable(); CUdeviceptr ptr = 0; if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) @@ -133,19 +140,19 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& 
device, CUdevicept if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); + cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); return err; } CUmemAccessDesc accessDesc = { - .location = { .type = location, .id = native.handle }, + .location = { .type = location, .id = nativeDevice }, .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, }; if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(ptr, size)); - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(ptr, size)); + cuda_native::defaultHandleResult(handler, cu.pcuMemUnmap(ptr, size)); + cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); return err; } @@ -157,8 +164,6 @@ static CUresult reserveAddressAndMapMemory(const CCUDADevice& device, CUdevicept core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&& inParams) { const auto handler = getHandler(); - auto& native = cuda_native::SAccess::native(*this); - auto logger = cuda_native::SAccess::logger(*this); const auto location = static_cast(inParams.locationType); CCUDAExportableMemory::SCachedCreationParams params = { @@ -180,37 +185,37 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = cuda_native::SAccess::allocationHandleType(), - .location = { .type = location, .id = native.handle }, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = location, .id = m_native->handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; - auto nativeState = cuda_native::SAccess::makeExportableMemoryNativeState(); + auto nativeState = std::make_unique(); CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, 
&prop, 0); CUDA_SUCCESS != err) { - logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; } if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); return nullptr; } - if (const auto err = reserveAddressAndMapMemory(*this,&cuda_native::SAccess::deviceptr(*nativeState), params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) { - logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); if (!CloseExternalHandle(params.externalHandle)) - logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } @@ -218,14 +223,14 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { cuda_native::defaultHandleResult(*handler, err); - cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(cuda_native::SAccess::deviceptr(*nativeState), params.granularSize)); + cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); + 
cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); if (!CloseExternalHandle(params.externalHandle)) - logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } - return cuda_native::SAccess::makeExportableMemory(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); + return CCUDAExportableMemory::create(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); } core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 2696fe1ebd..929453b3bd 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -59,13 +59,13 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); if (!CloseExternalHandle(m_params.externalHandle)) - cuda_native::SAccess::logger(*m_device).log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); } cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - return cuda_native::SCUdeviceptr(m_native->ptr).asOpaque(); + return cuda_native::SCUdeviceptr(m_native->ptr); } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9e40942914..ce9a8aa46b 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -340,19 +340,19 @@ CCUDAHandler::CCUDAHandler( CCUDAHandler::~CCUDAHandler() = default; -uint32_t CCUDAHandler::getBuildCUDAVersion() +uint32_t CCUDAHandler::getBuildCUDASDKVersion() { return CUDA_VERSION; } const cuda_native::CUDA& 
CCUDAHandler::getCUDAFunctionTable() const { - return cuda_native::SAccess::native(*this).cuda; + return m_native->cuda; } const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const { - return cuda_native::SAccess::native(*this).nvrtc; + return m_native->nvrtc; } namespace cuda_native @@ -726,21 +726,23 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) { - return defaultHandleResult(result,SAccess::logger(handler)); + return defaultHandleResult(result,handler.getLogger()); } bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) { + const auto& nvrtc = handler.getNVRTCFunctionTable(); + const auto logger = handler.getLogger(); switch (result) { case NVRTC_SUCCESS: return true; break; default: - if (SAccess::native(handler).nvrtc.pnvrtcGetErrorString) - SAccess::logger(handler).log("%s\n",system::ILogger::ELL_ERROR,SAccess::native(handler).nvrtc.pnvrtcGetErrorString(result)); + if (nvrtc.pnvrtcGetErrorString) + logger.log("%s\n",system::ILogger::ELL_ERROR,nvrtc.pnvrtcGetErrorString(result)); else - SAccess::logger(handler).log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } _NBL_DEBUG_BREAK_IF(true); @@ -886,31 +888,33 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string #else #error "Unsuported Platform" #endif - return SAccess::native(handler).nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + return handler.getNVRTCFunctionTable().pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); } nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange 
options) { - return SAccess::native(handler).nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); + return handler.getNVRTCFunctionTable().pnvrtcCompileProgram(prog,options.size(),options.begin()); } nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) { size_t _size = 0ull; - nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + const auto& nvrtc = handler.getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(prog, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return SAccess::native(handler).nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return nvrtc.pnvrtcGetProgramLog(prog,log.data()); } SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) { size_t _size = 0ull; - nvrtcResult sizeRes = SAccess::native(handler).nvrtc.pnvrtcGetPTXSize(prog,&_size); + const auto& nvrtc = handler.getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(prog,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -920,7 +924,7 @@ SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),SAccess::native(handler).nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } static const core::vector& getDefaultRuntimeIncludeOptions() @@ -962,7 +966,7 @@ SPTXResult compileDirectlyToPTX( auto cleanup = core::makeRAIIExiter([&]() -> void { if (program) - SAccess::native(handler).nvrtc.pnvrtcDestroyProgram(&program); + handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); }); result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); @@ -1095,7 +1099,7 @@ CCUDAHandler::CCUDAHandler( 
CCUDAHandler::~CCUDAHandler() = default; -uint32_t CCUDAHandler::getBuildCUDAVersion() +uint32_t CCUDAHandler::getBuildCUDASDKVersion() { return 0u; } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index cff48931c0..8ccad3e119 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -20,7 +20,7 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - return cuda_native::SCUexternalMemory(m_native->handle).asOpaque(); + return cuda_native::SCUexternalMemory(m_native->handle); } bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const @@ -38,7 +38,7 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) return false; - *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer).asOpaque(); + *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer); return true; } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 6a43fefc4c..d495f979ab 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -19,7 +19,7 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrhandle).asOpaque(); + return cuda_native::SCUexternalSemaphore(m_native->handle); } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 743bd10c3e..3a1500e77e 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -60,51 +60,6 @@ struct CCUDAImportedSemaphore::SNativeState {} }; -namespace cuda_native -{ - -struct SAccess -{ - static CCUDAHandler::SNativeState& native(CCUDAHandler& handler) { return *handler.m_native; } - static const 
CCUDAHandler::SNativeState& native(const CCUDAHandler& handler) { return *handler.m_native; } - - static CCUDADevice::SNativeState& native(CCUDADevice& device) { return *device.m_native; } - static const CCUDADevice::SNativeState& native(const CCUDADevice& device) { return *device.m_native; } - - static CCUDAExportableMemory::SNativeState& native(CCUDAExportableMemory& memory) { return *memory.m_native; } - static const CCUDAExportableMemory::SNativeState& native(const CCUDAExportableMemory& memory) { return *memory.m_native; } - static std::unique_ptr makeExportableMemoryNativeState() - { - return std::unique_ptr(new CCUDAExportableMemory::SNativeState()); - } - static CUdeviceptr& deviceptr(CCUDAExportableMemory::SNativeState& nativeState) { return nativeState.ptr; } - static core::smart_refctd_ptr makeExportableMemory(core::smart_refctd_ptr device, CCUDAExportableMemory::SCachedCreationParams&& params, std::unique_ptr&& nativeState) - { - return CCUDAExportableMemory::create(std::move(device),std::move(params),std::move(nativeState)); - } - - static CCUDAImportedMemory::SNativeState& native(CCUDAImportedMemory& memory) { return *memory.m_native; } - static const CCUDAImportedMemory::SNativeState& native(const CCUDAImportedMemory& memory) { return *memory.m_native; } - - static CCUDAImportedSemaphore::SNativeState& native(CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } - static const CCUDAImportedSemaphore::SNativeState& native(const CCUDAImportedSemaphore& semaphore) { return *semaphore.m_native; } - - static system::logger_opt_ptr logger(const CCUDAHandler& handler) { return handler.m_logger.get().get(); } - static system::logger_opt_ptr logger(const CCUDADevice& device) { return device.m_logger; } - static const CCUDADevice* device(const CCUDAImportedMemory& memory) { return memory.m_device.get(); } - static IDeviceMemoryAllocation* source(const CCUDAImportedMemory& memory) { return memory.m_src.get(); } - static CUmemAllocationHandleType 
allocationHandleType() - { - #ifdef _WIN32 - return CU_MEM_HANDLE_TYPE_WIN32; - #else - return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - #endif - } -}; - -} - } #endif From ed8a1d6eb3bfbdec6641410a033ab079169f3265 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 12:05:03 +0200 Subject: [PATCH 115/149] Refine CUDA interop boundary --- CMakeLists.txt | 2 +- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 172 +----- include/nbl/video/CCUDADevice.h | 2 + include/nbl/video/CCUDAHandler.h | 12 + include/nbl/video/CUDAInteropHandles.h | 4 - include/nbl/video/CUDAInteropNativeAPI.h | 191 +++++++ src/nbl/ext/CUDAInterop/README.md | 14 +- src/nbl/video/CCUDADevice.cpp | 77 +-- src/nbl/video/CCUDAExportableMemory.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 489 ++++++++++++++---- src/nbl/video/CCUDAImportedMemory.cpp | 34 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 2 +- src/nbl/video/CUDAInteropNativeState.hpp | 24 +- 14 files changed, 678 insertions(+), 349 deletions(-) create mode 100644 include/nbl/video/CUDAInteropNativeAPI.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9251a3ee68..97ece5d9f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,7 +70,7 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Build the CUDA interop extension?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build CUDA interop support?" 
OFF) set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) diff --git a/examples_tests b/examples_tests index e289ee14f5..d373d313d3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit e289ee14f5b8f05004726e6f03c81a9a2e768219 +Subproject commit d373d313d3e70579d650c7804af8a2785cfede9a diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 0e08fb2b97..538645ce3d 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -35,159 +35,10 @@ */ #ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ #define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ -#include -#include "nbl/video/CUDAInterop.h" -#include "nbl/asset/ICPUBuffer.h" -#include "nbl/system/DynamicFunctionCaller.h" -#include "cuda.h" -#include "nvrtc.h" +#include "nbl/video/CUDAInteropNativeAPI.h" namespace nbl::video::cuda_native { -inline constexpr int MinimumCUDADriverVersion = 13000; -inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; -static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); - -/* - The CUDA/NVRTC table classes below contain the calls used and tested by Nabla's interop implementation. - After including this SDK opt-in header, consumer code can load extra Driver API or NVRTC symbols from the - same loaded libraries without changing Nabla's ABI: - - auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); - - The requested symbol must be declared by the CUDA SDK headers visible to this translation unit because the - helper uses decltype(cuNewCall) to preserve the native function signature. 
-*/ -using LibLoader = system::DefaultFuncPtrLoader; - -NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - ,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - ,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - ,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - 
,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync - ,cuLogsRegisterCallback -); - -NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize -); - -struct SCUDADeviceInfo -{ - CUdevice handle = {}; - CUuuid uuid = {}; -}; - -struct SPTXResult -{ - core::smart_refctd_ptr ptx; - nvrtcResult result; -}; - /* CUDA SDK view of an SDK-free opaque handle. @@ -215,27 +66,6 @@ inline bool isBuildCUDASDKVersionExactMatch() return buildVersion==0u || buildVersion==CUDA_VERSION; } -/* - Nabla interop API declarations with CUDA SDK signatures. - - These declarations belong to the Nabla interop API. They live behind Nabla::ext::CUDAInterop because their - signatures mention CUDA/NVRTC SDK types directly. Keeping them out of nbl/video/CCUDA*.h means Nabla's public - API can be parsed and packaged without CUDA SDK headers. Nabla still owns the exported glue definitions. - Consumers accept this SDK-typed API surface only by including this header and linking the explicit interop - target. 
-*/ -NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); -NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); -NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); -NBL_API2 SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr -); - } #endif diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 4658e51a10..56e81d4b2f 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -82,12 +82,14 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); + bool isValid() const; const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + bool m_valid = false; core::smart_refctd_ptr m_handler; std::unique_ptr m_native; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index a77ab66b68..e69792b217 100644 --- 
a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -31,9 +31,18 @@ namespace cuda_interop { inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; +struct SRuntimeIncludeDir +{ + system::path path; + std::string source; + uint32_t cudaVersion = 0u; + bool completeRuntimeHeaderSet = false; +}; + struct SRuntimeCompileEnvironment { core::vector includeDirs; + core::vector includeDirInfos; }; NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); @@ -52,8 +61,11 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted public: static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); static uint32_t getBuildCUDASDKVersion(); + uint32_t getLoadedCUDADriverVersion() const; + std::array getLoadedNVRTCVersion() const; const cuda_native::CUDA& getCUDAFunctionTable() const; const cuda_native::NVRTC& getNVRTCFunctionTable() const; + core::SRange getDefaultRuntimeIncludeOptions() const; inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } inline core::SRange getSTDHeaders() diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 92888d3ccf..987a130ad1 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -65,10 +65,6 @@ struct SNativeHandle operator Opaque&() { return reinterpret_cast(value); } operator const Opaque&() const { return reinterpret_cast(value); } - Opaque* opaque() { return &static_cast(*this); } - const Opaque* opaque() const { return &static_cast(*this); } - Opaque asOpaque() const { return static_cast(*this); } - cuda_t value = {}; }; diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h new file mode 100644 index 0000000000..52dad41f09 --- /dev/null +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -0,0 +1,191 @@ +// Copyright (C) 2018-2020 - DevSH 
Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ + +#include + +#include "nbl/video/CUDAInterop.h" +#include "nbl/asset/ICPUBuffer.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include "cuda.h" +#include "nvrtc.h" + +namespace nbl::video::cuda_native +{ + +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; +static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); + +/* + Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. + + This file lives under include/ because it is shared with nbl/ext/CUDAInterop/CUDAInteropNative.h, the public + opt-in header for consumers that explicitly accept CUDA SDK types. Its physical location does not make it part + of the default Nabla public interface: nbl/video/CCUDA*.h headers, Nabla::Nabla public requirements, and PCH + do not include it, so normal Nabla consumers do not need cuda.h or nvrtc.h. + + The declarations below intentionally use CUDA/NVRTC SDK types because they describe the SDK-typed glue between + raw CUDA code and Nabla's exported CUDA interop objects: dynamic function tables, NVRTC helpers, error handling, + and runtime header discovery integration. Consumers enter this surface only by linking Nabla::ext::CUDAInterop + and including nbl/ext/CUDAInterop/CUDAInteropNative.h. +*/ +using LibLoader = system::DefaultFuncPtrLoader; + +/* + The CUDA/NVRTC table classes contain the calls used and tested by Nabla's interop implementation. 
SDK opt-in + consumers can load additional Driver API or NVRTC symbols from the same table without changing Nabla's ABI: + + auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); + + The requested symbol must be declared by the CUDA SDK visible to that translation unit because the helper uses + decltype(cuNewCall) to preserve the native function signature. +*/ +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + ,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload 
+ ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +struct SPTXResult +{ + core::smart_refctd_ptr ptx; + nvrtcResult result; +}; + +/* + Exported Nabla glue declarations with CUDA SDK signatures. + + These are not a CUDA wrapper. They are the small boundary surface used for error handling, NVRTC helpers, + runtime header discovery integration, and dynamic CUDA/NVRTC table access. Nabla owns the definitions. + The signatures mention CUDA SDK types, so they are intentionally unavailable to consumers that only parse + SDK-free nbl/video/CCUDA*.h headers. 
+*/ +NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); +NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); +NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); +NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); +NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); +NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); +NBL_API2 SPTXResult compileDirectlyToPTX( + CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr +); + +} + +#endif diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 231658e949..5677db046f 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,7 +6,8 @@ - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `CUDAInteropNative.h` is the opt-in SDK boundary. 
It includes CUDA SDK headers and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. +- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and exported Nabla glue functions whose signatures use CUDA SDK types. +- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. It includes the native API header and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. ## CMake Usage @@ -89,7 +90,7 @@ auto compile = nbl::video::cuda_native::compileDirectlyToPTX( SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: - `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. -- The default tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. The symbol name must be declared by the CUDA SDK headers visible to that translation unit: +- The shipped tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. The symbol name must be declared by the CUDA SDK headers visible to that translation unit: ```cpp auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); @@ -115,7 +116,7 @@ Smoke examples: - Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. - Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. 
- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. -- SDK-sized arrays, CUDA enum storage, and CUDA implementation headers stay private to Nabla. +- SDK-sized arrays, CUDA enum storage, and CUDA implementation state stay private to Nabla. - A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDASDKVersionExactMatch()` when exact CUDA SDK version matching is required. - Runtime include-option construction is header-only and is not part of the exported ABI. - The loaded CUDA driver and NVRTC runtime are validated at runtime. @@ -129,8 +130,11 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. - `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. - Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- Runtime lookup records the source of every accepted include root and parses `CUDA_VERSION` from `cuda.h` when available. The startup report prints the primary include root, its source, its parsed CUDA version, and the full search order. +- The first include root is not required to match the SDK used to build Nabla. It is the first `-I` path visible to NVRTC, so the first path containing a requested header wins just like normal C/C++ include search. +- If the primary runtime header root is incomplete or reports a different CUDA version than the loaded NVRTC runtime, Nabla logs a warning. This is diagnostic policy, not an automatic hard failure. 
- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `cuda_native::compileDirectlyToPTX` appends discovered include directories to NVRTC options. Discovery is cached after the first call. +- `CCUDAHandler` captures discovered include directories when it is created. `cuda_native::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. @@ -157,7 +161,7 @@ CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On ## CUDA ON/OFF Builds - SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. -- CUDA implementation headers and SDK includes stay behind `_NBL_COMPILE_WITH_CUDA_`. +- Nabla implementation `.cpp` files include CUDA SDK headers only behind `_NBL_COMPILE_WITH_CUDA_`. - CUDA OFF implementations are local stubs in the same `.cpp` files. Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. - The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. 
diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 1c73068a6d..0178f31fc7 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -67,15 +67,15 @@ static CUmemAllocationHandleType getAllocationHandleType() } CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& vulkanConnection, - IPhysicalDevice* const vulkanDevice, + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, - core::smart_refctd_ptr&& handler) : + core::smart_refctd_ptr&& handler) : m_logger(vulkanDevice->getDebugCallback()->getLogger()), - m_defaultCompileOptions(), - m_vulkanConnection(std::move(vulkanConnection)), - m_virtualArchitecture(virtualArchitecture), + m_defaultCompileOptions(), + m_vulkanConnection(std::move(vulkanConnection)), + m_virtualArchitecture(virtualArchitecture), m_handler(std::move(handler)), m_native(std::move(nativeState)) { @@ -86,43 +86,43 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - const auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = m_handler->getCUDAFunctionTable(); if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) - assert(false); + return; if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) - assert(false); + return; for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - - #ifdef _WIN32 - OBJECT_ATTRIBUTES metadata = { - .Length = sizeof(OBJECT_ATTRIBUTES) - }; - #endif - - const auto prop = CUmemAllocationProp{ - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = getAllocationHandleType(), - .location = { .type = static_cast(locationType), .id = m_native->handle }, - #ifdef _WIN32 - .win32HandleMetaData = &metadata, - #endif - }; +#ifdef _WIN32 
+ OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; +#endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = static_cast(locationType), .id = m_native->handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) - assert(false); + return; } + m_valid = true; } cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return cuda_native::SCUdevice(m_native->handle); + return cuda_interop::SNativeHandle(m_native->handle); } cuda_interop::SCUcontext CCUDADevice::getContext() const { - return cuda_native::SCUcontext(m_native->context); + return cuda_interop::SNativeHandle(m_native->context); } static bool isDeviceLocal(CUmemLocationType location) @@ -176,14 +176,14 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; auto& cu = handler->getCUDAFunctionTable(); - + #ifdef _WIN32 OBJECT_ATTRIBUTES metadata = { - .Length = sizeof(OBJECT_ATTRIBUTES) + .Length = sizeof(OBJECT_ATTRIBUTES) }; #endif - const auto prop = CUmemAllocationProp{ + const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, .requestedHandleTypes = getAllocationHandleType(), .location = { .type = location, .id = m_native->handle }, @@ -275,10 +275,9 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, - // TODO(kevinyu): Fix this later. Make it compile first. 
.handle = {.win32 = {.handle = sema->getExternalHandle() }}, #else - .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, .handle = {.fd = sema->getExternalHandle()} #endif }; @@ -299,7 +298,13 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { - cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); + if (m_native->context) + cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); +} + +bool CCUDADevice::isValid() const +{ + return m_valid; } } @@ -321,6 +326,7 @@ CCUDADevice::CCUDADevice( : m_logger(nullptr) , m_vulkanConnection(std::move(vulkanConnection)) , m_virtualArchitecture(virtualArchitecture) + , m_valid(false) , m_handler(std::move(handler)) , m_native(std::move(nativeState)) { @@ -329,6 +335,11 @@ CCUDADevice::CCUDADevice( CCUDADevice::~CCUDADevice() = default; +bool CCUDADevice::isValid() const +{ + return false; +} + cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { return {}; diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 929453b3bd..722c958b68 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -65,7 +65,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - return cuda_native::SCUdeviceptr(m_native->ptr); + return cuda_interop::SNativeHandle(m_native->ptr); } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index ce9a8aa46b..6d8b2ffb70 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace nbl::video::cuda_interop @@ -62,28 +63,71 @@ bool looksLikeCUDAIncludeDir(const system::path& path) 
isRegularFile(path/"nv"/"target"); } -void appendIncludeDir(core::vector& includeDirs, system::path path) +uint32_t readCUDAVersion(const system::path& includeDir) +{ + std::ifstream input(includeDir/"cuda.h"); + if (!input) + return 0u; + + std::string line; + while (std::getline(input,line)) + { + std::istringstream stream(line); + std::string directive; + stream >> directive; + if (directive!="#define") + continue; + + std::string name; + stream >> name; + if (name!="CUDA_VERSION") + continue; + + uint32_t version = 0u; + if (stream >> version) + return version; + } + return 0u; +} + +bool looksLikeCompleteRuntimeHeaderSet(const system::path& includeDir) +{ + return isRegularFile(includeDir/"cuda.h") && + isRegularFile(includeDir/"cuda_runtime_api.h") && + isRegularFile(includeDir/"vector_types.h"); +} + +void appendIncludeDir(SRuntimeCompileEnvironment& environment, system::path path, std::string source) { if (path.empty() || !looksLikeCUDAIncludeDir(path)) return; path = normalizedAbsolute(std::move(path)); const auto pathString = path.generic_string(); - const auto alreadyAdded = std::find_if(includeDirs.begin(),includeDirs.end(),[&](const system::path& existing) { + const auto alreadyAdded = std::find_if(environment.includeDirs.begin(),environment.includeDirs.end(),[&](const system::path& existing) { return existing.generic_string()==pathString; }); - if (alreadyAdded==includeDirs.end()) - includeDirs.push_back(std::move(path)); + if (alreadyAdded==environment.includeDirs.end()) + { + SRuntimeIncludeDir info; + info.path = path; + info.source = std::move(source); + info.cudaVersion = readCUDAVersion(path); + info.completeRuntimeHeaderSet = looksLikeCompleteRuntimeHeaderSet(path); + + environment.includeDirs.push_back(std::move(path)); + environment.includeDirInfos.push_back(std::move(info)); + } } -void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const system::path& root, uint32_t maxDepth) +void 
appendCUDAIncludeDirsBelow(SRuntimeCompileEnvironment& environment, const system::path& root, uint32_t maxDepth, std::string source) { if (!isDirectory(root)) return; if (looksLikeCUDAIncludeDir(root)) { - appendIncludeDir(includeDirs,root); + appendIncludeDir(environment,root,std::move(source)); return; } if (maxDepth==0u) @@ -106,19 +150,19 @@ void appendCUDAIncludeDirsBelow(core::vector& includeDirs, const s return lhs.generic_string()>rhs.generic_string(); }); for (const auto& candidate : candidates) - appendCUDAIncludeDirsBelow(includeDirs,candidate,maxDepth-1u); + appendCUDAIncludeDirsBelow(environment,candidate,maxDepth-1u,source); } -void appendCUDAIncludeRoot(core::vector& includeDirs, const system::path& root) +void appendCUDAIncludeRoot(SRuntimeCompileEnvironment& environment, const system::path& root, std::string source) { if (root.empty()) return; - appendIncludeDir(includeDirs,root); - appendIncludeDir(includeDirs,root/"include"); + appendIncludeDir(environment,root,source); + appendIncludeDir(environment,root/"include",std::move(source)); } -void appendRuntimePathsConfig(core::vector& includeDirs, const system::path& configFile) +void appendRuntimePathsConfig(SRuntimeCompileEnvironment& environment, const system::path& configFile, const char* source) { if (!isRegularFile(configFile)) return; @@ -137,7 +181,7 @@ void appendRuntimePathsConfig(core::vector& includeDirs, const sys for (const auto& path : *paths) if (path.is_string()) - appendIncludeDir(includeDirs,system::path(path.get())); + appendIncludeDir(environment,system::path(path.get()),std::string(source)+": "+configFile.generic_string()); } template @@ -160,60 +204,66 @@ void appendPathListEnv(const char* name, Append append) } } -void appendRuntimePathsConfigs(core::vector& includeDirs, const core::vector& explicitRuntimePathFiles) +void appendRuntimePathsConfigs(SRuntimeCompileEnvironment& environment, const core::vector& explicitRuntimePathFiles) { for (const auto& runtimePathFile : 
explicitRuntimePathFiles) - appendRuntimePathsConfig(includeDirs,runtimePathFile); + appendRuntimePathsConfig(environment,runtimePathFile,"explicit runtime JSON"); - const auto appendConfig = [&](const system::path& path) { appendRuntimePathsConfig(includeDirs,path); }; - appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",appendConfig); - appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",appendConfig); + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"NBL_CUDA_INTEROP_RUNTIME_JSON"); + }); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + }); const auto exeDir = system::executableDirectory(); if (!exeDir.empty()) - appendRuntimePathsConfig(includeDirs,exeDir/RuntimePathsFileName); + appendRuntimePathsConfig(environment,exeDir/RuntimePathsFileName,"executable-local runtime JSON"); } -void appendAppLocalIncludeDirs(core::vector& includeDirs) +void appendAppLocalIncludeDirs(SRuntimeCompileEnvironment& environment) { const auto exeDir = system::executableDirectory(); if (exeDir.empty()) return; - appendIncludeDir(includeDirs,exeDir/"cuda"/"include"); - appendCUDAIncludeDirsBelow(includeDirs,exeDir/"nvidia",4u); - appendIncludeDir(includeDirs,exeDir/"Libraries"/"cuda"/"include"); - appendIncludeDir(includeDirs,exeDir.parent_path()/"cuda"/"include"); - appendCUDAIncludeDirsBelow(includeDirs,exeDir.parent_path()/"nvidia",4u); + appendIncludeDir(environment,exeDir/"cuda"/"include","app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir/"nvidia",4u,"app-local nvidia package"); + appendIncludeDir(environment,exeDir/"Libraries"/"cuda"/"include","app-local Libraries/cuda/include"); + appendIncludeDir(environment,exeDir.parent_path()/"cuda"/"include","parent app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir.parent_path()/"nvidia",4u,"parent 
app-local nvidia package"); } -void appendPythonPackageIncludeDirs(core::vector& includeDirs, const system::path& root) +void appendPythonPackageIncludeDirs(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) { if (root.empty()) return; - appendCUDAIncludeDirsBelow(includeDirs,root/"Lib"/"site-packages"/"nvidia",4u); - appendCUDAIncludeDirsBelow(includeDirs,root/"lib"/"site-packages"/"nvidia",4u); - appendIncludeDir(includeDirs,root/"Library"/"include"); - appendIncludeDir(includeDirs,root/"include"); + appendCUDAIncludeDirsBelow(environment,root/"Lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendCUDAIncludeDirsBelow(environment,root/"lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendIncludeDir(environment,root/"Library"/"include",std::string(source)+" Library/include"); + appendIncludeDir(environment,root/"include",std::string(source)+" include"); } -void appendEnvironmentIncludeDirs(core::vector& includeDirs) +void appendEnvironmentIncludeDirs(SRuntimeCompileEnvironment& environment) { - const auto appendInclude = [&](const system::path& path) { appendIncludeDir(includeDirs,path); }; - appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); - appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",appendInclude); + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); + }); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + }); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_PATH")); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_HOME")); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDA_ROOT")); - appendCUDAIncludeRoot(includeDirs,readEnvironmentVariable("CUDAToolkit_ROOT")); + 
appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_PATH"),"CUDA_PATH"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_HOME"),"CUDA_HOME"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_ROOT"),"CUDA_ROOT"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDAToolkit_ROOT"),"CUDAToolkit_ROOT"); - appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("VIRTUAL_ENV")); - appendPythonPackageIncludeDirs(includeDirs,readEnvironmentVariable("CONDA_PREFIX")); + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("VIRTUAL_ENV"),"VIRTUAL_ENV"); + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("CONDA_PREFIX"),"CONDA_PREFIX"); } -void appendCUDAInstallRoots(core::vector& includeDirs, const system::path& root) +void appendCUDAInstallRoots(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) { if (!isDirectory(root)) return; @@ -233,17 +283,17 @@ void appendCUDAInstallRoots(core::vector& includeDirs, const syste return lhs.generic_string()>rhs.generic_string(); }); for (const auto& candidate : candidates) - appendIncludeDir(includeDirs,candidate); + appendIncludeDir(environment,candidate,source); } -void appendSystemIncludeDirs(core::vector& includeDirs) +void appendSystemIncludeDirs(SRuntimeCompileEnvironment& environment) { #if defined(_NBL_PLATFORM_WINDOWS_) - appendCUDAInstallRoots(includeDirs,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA"); + appendCUDAInstallRoots(environment,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA","system CUDA Toolkit install root"); #else - appendIncludeDir(includeDirs,"/usr/local/cuda/include"); - appendCUDAInstallRoots(includeDirs,"/usr/local"); - appendIncludeDir(includeDirs,"/usr/include"); + appendIncludeDir(environment,"/usr/local/cuda/include","system /usr/local/cuda"); + appendCUDAInstallRoots(environment,"/usr/local","system /usr/local CUDA install root"); + 
appendIncludeDir(environment,"/usr/include","system /usr/include"); #endif } @@ -252,13 +302,25 @@ void appendSystemIncludeDirs(core::vector& includeDirs) SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) { SRuntimeCompileEnvironment environment; + + /* + Runtime header discovery builds the ordered include list passed to NVRTC. It is not a lock to the CUDA SDK + used to build Nabla. A packaged Nabla must stay relocatable, so host-specific include paths are accepted + only when the application provides them at runtime: direct arguments, JSON next to the executable, an + override JSON, app-local header bundles, environment variables, or finally common toolkit install roots. + + The first root containing a requested header wins exactly like normal C/C++ include search. Keep every + accepted root with its source and parsed CUDA_VERSION so startup logs can explain what NVRTC will see. + This is also why mismatched or partial roots produce diagnostics instead of changing discovery order or + hard-failing before the user kernel is compiled. 
+ */ for (auto& includeDir : explicitIncludeDirs) - appendIncludeDir(environment.includeDirs,std::move(includeDir)); + appendIncludeDir(environment,std::move(includeDir),"explicit include dir"); - appendRuntimePathsConfigs(environment.includeDirs,runtimePathFiles); - appendAppLocalIncludeDirs(environment.includeDirs); - appendEnvironmentIncludeDirs(environment.includeDirs); - appendSystemIncludeDirs(environment.includeDirs); + appendRuntimePathsConfigs(environment,runtimePathFiles); + appendAppLocalIncludeDirs(environment); + appendEnvironmentIncludeDirs(environment); + appendSystemIncludeDirs(environment); return environment; } @@ -292,6 +354,82 @@ int cudaVersionMinor(int version) return (version%1000)/10; } +int cudaVersionCode(int major, int minor) +{ + return major*1000+minor*10; +} + +system::path loadedRuntimeModulePath(const char* moduleName) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + const auto moduleDir = system::loadedModuleDirectory(moduleName); + if (moduleDir.empty()) + return {}; + return moduleDir/(std::string(moduleName)+".dll"); + #else + return {}; + #endif +} + +std::string cudaVersionString(int version) +{ + std::ostringstream stream; + stream << cudaVersionMajor(version) << "." << cudaVersionMinor(version); + return stream.str(); +} + +std::string cudaVersionString(const std::array& version) +{ + std::ostringstream stream; + stream << version[0] << "." 
<< version[1]; + return stream.str(); +} + +std::string runtimeIncludeDirDescription(const cuda_interop::SRuntimeIncludeDir& includeDir) +{ + std::ostringstream stream; + stream << includeDir.path.generic_string() << " (" << includeDir.source; + if (includeDir.cudaVersion!=0u) + stream << ", CUDA_VERSION " << includeDir.cudaVersion << " / " << cudaVersionString(includeDir.cudaVersion); + else + stream << ", CUDA_VERSION unknown"; + if (!includeDir.completeRuntimeHeaderSet) + stream << ", partial header root"; + stream << ")"; + return stream.str(); +} + +std::string cudaRuntimeReport( + const int buildVersion, const int cudaDriverVersion, const system::path& cudaDriverPath, + const std::array& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, + const cuda_interop::SRuntimeCompileEnvironment& runtimeEnvironment) +{ + std::ostringstream stream; + stream << "CCUDAHandler: CUDA interop runtime report:\n"; + stream << " - Nabla build CUDA SDK: " << cudaVersionString(buildVersion) << "\n"; + stream << " - CUDA Driver API: " << cudaVersionString(cudaDriverVersion); + if (!cudaDriverPath.empty()) + stream << " (" << cudaDriverPath.generic_string() << ")"; + stream << "\n"; + stream << " - NVRTC runtime: " << cudaVersionString(nvrtcVersion) << " (" << nvrtcLibraryName; + if (!nvrtcPath.empty()) + stream << ", " << nvrtcPath.generic_string(); + stream << ")\n"; + + if (runtimeEnvironment.includeDirs.empty()) + { + stream << " - NVRTC runtime header search path: none discovered"; + } + else + { + stream << " - Primary NVRTC runtime header path: " << runtimeIncludeDirDescription(runtimeEnvironment.includeDirInfos.front()) << "\n"; + stream << " - NVRTC runtime header search order (first path containing the requested header wins):\n"; + for (const auto& includeDir : runtimeEnvironment.includeDirInfos) + stream << " - " << runtimeIncludeDirDescription(includeDir) << "\n"; + } + return stream.str(); +} + } CCUDAHandler::CCUDAHandler( @@ -310,6 
+448,8 @@ CCUDAHandler::CCUDAHandler( m_headerNamesStorage.push_back(header->getFileName().string()); m_headerNames.push_back(m_headerNamesStorage.back().c_str()); } + for (const auto& option : m_native->runtimeIncludeOptions) + m_native->runtimeIncludeOptionPtrs.push_back(option.c_str()); int deviceCount = 0; if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) @@ -326,9 +466,8 @@ CCUDAHandler::CCUDAHandler( continue; auto& nativeDevice = m_native->deviceStates.emplace_back(); - nativeDevice.info.handle = handle; - nativeDevice.info.uuid = uuid; - m_native->availableDevices.push_back(nativeDevice.info); + nativeDevice.handle = handle; + nativeDevice.uuid = uuid; auto& cleanDevice = m_availableDevices.emplace_back(); memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); @@ -345,6 +484,16 @@ uint32_t CCUDAHandler::getBuildCUDASDKVersion() return CUDA_VERSION; } +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return m_native->cudaDriverVersion; +} + +std::array CCUDAHandler::getLoadedNVRTCVersion() const +{ + return m_native->nvrtcVersion; +} + const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const { return m_native->cuda; @@ -355,6 +504,14 @@ const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const return m_native->nvrtc; } +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + if (m_native->runtimeIncludeOptionPtrs.empty()) + return {nullptr,nullptr}; + const auto* begin = m_native->runtimeIncludeOptionPtrs.data(); + return {begin,begin+m_native->runtimeIncludeOptionPtrs.size()}; +} + namespace cuda_native { @@ -480,6 +637,11 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) This indicates that a PTX JIT compilation failed. )===",system::ILogger::ELL_ERROR); break; + case CUDA_ERROR_UNSUPPORTED_PTX_VERSION: + logger.log(R"===(CCUDAHandler: + This indicates that the PTX version is unsupported by the CUDA driver. 
Check that the CUDA driver runtime can consume PTX produced by the loaded NVRTC runtime. + )===",system::ILogger::ELL_ERROR); + break; case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: logger.log(R"===(CCUDAHandler: This indicates an error with OpenGL or DirectX context. @@ -717,15 +879,25 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) break; case CUDA_ERROR_UNKNOWN: default: - logger.log("CCUDAHandler: Unknown CUDA Error!\n",system::ILogger::ELL_ERROR); + logger.log("CCUDAHandler: Unknown CUDA error code %d.",system::ILogger::ELL_ERROR,static_cast(result)); break; } - _NBL_DEBUG_BREAK_IF(true); return false; } bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) { + if (result==CUDA_ERROR_UNSUPPORTED_PTX_VERSION) + { + const auto cudaVersion = handler.getLoadedCUDADriverVersion(); + const auto nvrtcVersion = handler.getLoadedNVRTCVersion(); + handler.getLogger().log( + "CCUDAHandler: CUDA driver API %d.%d rejected PTX produced through NVRTC %d.%d. Install a newer NVIDIA driver or use an NVRTC/runtime-header set compatible with the installed driver.", + system::ILogger::ELL_ERROR, + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion), + nvrtcVersion[0],nvrtcVersion[1] + ); + } return defaultHandleResult(result,handler.getLogger()); } @@ -745,7 +917,6 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } - _NBL_DEBUG_BREAK_IF(true); return false; } @@ -764,39 +935,6 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #error "Unsuported Platform" #endif ); - - cuda_native::NVRTC nvrtc = {}; - #if defined(_NBL_WINDOWS_API_) - // Perpetual TODO: any new CUDA releases we need to account for? 
- // Version List: https://developer.nvidia.com/cuda-toolkit-archive - const char* nvrtc64_versions[] = { - "nvrtc64_132", - "nvrtc64_131", - "nvrtc64_130", - nullptr - }; - - const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; - for (auto verpath=nvrtc64_versions; *verpath; verpath++) - { - for (auto suffix=nvrtc64_suffices; *suffix; suffix++) - { - std::string path(*verpath); - path += *suffix; - nvrtc = cuda_native::NVRTC(path.c_str()); - if (nvrtc.pnvrtcVersion) - break; - } - if (nvrtc.pnvrtcVersion) - break; - } - #elif defined(_NBL_POSIX_API_) - nvrtc = cuda_native::NVRTC("nvrtc"); - //nvrtc_builtins = NVRTC("nvrtc-builtins"); - #else - #error "Unsuported Platform" - #endif - // need a complex safe calling chain because DLL/SO might not have loaded #define SAFE_CUDA_CALL(FUNC,...) \ @@ -832,6 +970,86 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste // stop the pollution #undef SAFE_CUDA_CALL + auto readNVRTCVersion = [&](const cuda_native::NVRTC& candidate, std::array& version, const char* name) -> bool + { + if (!candidate.pnvrtcVersion) + return false; + + const auto result = candidate.pnvrtcVersion(version.data(),version.data()+1); + if (result==NVRTC_SUCCESS) + return true; + + logger.log("CCUDAHandler: nvrtcVersion failed for %s with NVRTC error code %d.",system::ILogger::ELL_WARNING,name,static_cast(result)); + version = {-1,-1}; + return false; + }; + + cuda_native::NVRTC nvrtc = {}; + std::array nvrtcVersion = {-1,-1}; + std::string nvrtcLibraryName; + + #if defined(_NBL_WINDOWS_API_) + cuda_native::NVRTC fallbackNVRTC = {}; + std::array fallbackNVRTCVersion = {-1,-1}; + std::string fallbackNVRTCLibraryName; + + /* + The CUDA driver consumes the final PTX, not the toolkit that provided headers or nvrtc*.dll. + A real machine can have an older NVIDIA driver and a newer CUDA Toolkit side by side, for example + driver API 13.1 from nvcuda.dll with CUDA 13.2 Toolkit/NVRTC in CUDA_PATH. 
In that setup NVRTC can + emit PTX the installed driver rejects with CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Prefer an NVRTC runtime + that is not newer than the loaded driver and log the full version matrix when no compatible one exists. + */ + const char* nvrtc64_versions[] = { + "nvrtc64_132", + "nvrtc64_131", + "nvrtc64_130", + nullptr + }; + + const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; + for (auto verpath=nvrtc64_versions; *verpath && !nvrtc.pnvrtcVersion; verpath++) + { + for (auto suffix=nvrtc64_suffices; *suffix; suffix++) + { + std::string candidateName(*verpath); + candidateName += *suffix; + + cuda_native::NVRTC candidate(candidateName.c_str()); + std::array candidateVersion = {-1,-1}; + if (!readNVRTCVersion(candidate,candidateVersion,candidateName.c_str())) + continue; + + if (cudaVersionCode(candidateVersion[0],candidateVersion[1])<=cudaVersion) + { + nvrtc = std::move(candidate); + nvrtcVersion = candidateVersion; + nvrtcLibraryName = std::move(candidateName); + break; + } + + if (!fallbackNVRTC.pnvrtcVersion) + { + fallbackNVRTC = std::move(candidate); + fallbackNVRTCVersion = candidateVersion; + fallbackNVRTCLibraryName = std::move(candidateName); + } + } + } + + if (!nvrtc.pnvrtcVersion && fallbackNVRTC.pnvrtcVersion) + { + nvrtc = std::move(fallbackNVRTC); + nvrtcVersion = fallbackNVRTCVersion; + nvrtcLibraryName = std::move(fallbackNVRTCLibraryName); + } + #elif defined(_NBL_POSIX_API_) + nvrtcLibraryName = "nvrtc"; + nvrtc = cuda_native::NVRTC(nvrtcLibraryName.c_str()); + readNVRTCVersion(nvrtc,nvrtcVersion,nvrtcLibraryName.c_str()); + #else + #error "Unsuported Platform" + #endif // check nvrtc existence and compatibility if (!nvrtc.pnvrtcVersion) @@ -839,13 +1057,6 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste logger.log("CCUDAHandler: NVRTC runtime was not found. 
Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); return nullptr; } - int nvrtcVersion[2] = { -1,-1 }; - const auto nvrtcVersionResult = nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersionResult!=NVRTC_SUCCESS) - { - logger.log("CCUDAHandler: nvrtcVersion failed with NVRTC error code %d.",system::ILogger::ELL_ERROR,static_cast(nvrtcVersionResult)); - return nullptr; - } if (nvrtcVersion[0] CCUDAHandler::create(system::ISystem* syste return nullptr; } + const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); + auto runtimeEnvironment = cuda_interop::findRuntimeCompileEnvironment(); + const auto cudaDriverPath = loadedRuntimeModulePath("nvcuda"); + const auto nvrtcPath = loadedRuntimeModulePath(nvrtcLibraryName.c_str()); + const auto report = cudaRuntimeReport(buildVersion,cudaVersion,cudaDriverPath,nvrtcVersion,nvrtcLibraryName,nvrtcPath,runtimeEnvironment); + logger.log("%s",system::ILogger::ELL_INFO,report.c_str()); + + if (cudaVersionCode(nvrtcVersion[0],nvrtcVersion[1])>cudaVersion) + { + logger.log( + "CCUDAHandler: NVRTC runtime %d.%d is newer than CUDA driver API %d.%d. PTX generated by this NVRTC may be unsupported by the installed driver.", + system::ILogger::ELL_WARNING, + nvrtcVersion[0],nvrtcVersion[1], + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion) + ); + } + if (runtimeEnvironment.includeDirs.empty()) + { + logger.log("CCUDAHandler: no CUDA runtime headers were discovered for NVRTC include paths.",system::ILogger::ELL_WARNING); + } + else + { + const auto& primaryIncludeDir = runtimeEnvironment.includeDirInfos.front(); + if (!primaryIncludeDir.completeRuntimeHeaderSet) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime header path %s does not contain cuda.h, cuda_runtime_api.h, and vector_types.h together. 
NVRTC may use later include paths for missing headers.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.path.generic_string().c_str() + ); + } + + const auto nvrtcVersionCode = cudaVersionCode(nvrtcVersion[0],nvrtcVersion[1]); + if (primaryIncludeDir.cudaVersion!=0u && primaryIncludeDir.cudaVersion!=static_cast(nvrtcVersionCode)) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime headers report CUDA_VERSION %u (%s), while loaded NVRTC is %s. This is allowed by discovery policy, but kernels using version-specific CUDA headers may fail to compile.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.cudaVersion, + cudaVersionString(primaryIncludeDir.cudaVersion).c_str(), + cudaVersionString(nvrtcVersion).c_str() + ); + } + } + // add headers core::vector> headers; for (const auto& it : jitify::detail::get_jitsafe_headers_map()) @@ -864,14 +1120,13 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste headers.push_back(core::make_smart_refctd_ptr>( it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, - // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? 
std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } return core::smart_refctd_ptr( - new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc)),std::move(headers),std::move(_logger)), + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc),cudaVersion,nvrtcVersion,std::move(runtimeEnvironment)),std::move(headers),std::move(_logger)), core::dont_grab ); } @@ -927,25 +1182,19 @@ SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) return {std::move(ptx),nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } -static const core::vector& getDefaultRuntimeIncludeOptions() -{ - static const auto RuntimeIncludeOptions = cuda_interop::makeNVRTCIncludeOptions(cuda_interop::findRuntimeCompileEnvironment()); - return RuntimeIncludeOptions; -} - static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) { log.clear(); if (result!=NVRTC_SUCCESS) return {nullptr,result}; - const auto& runtimeIncludeOptions = getDefaultRuntimeIncludeOptions(); + const auto runtimeIncludeOptions = handler.getDefaultRuntimeIncludeOptions(); core::vector options; options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); for (const auto option : nvrtcOptions) options.push_back(option); - for (const auto& option : runtimeIncludeOptions) - options.push_back(option.c_str()); + for (const auto option : runtimeIncludeOptions) + options.push_back(option); const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); @@ -985,7 +1234,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct for (const auto& device : m_native->deviceStates) { - if (!memcmp(&device.info.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; @@ -1067,10 +1316,13 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::smart_refctd_ptr( - new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.info.handle),core::smart_refctd_ptr(this)), + auto cudaDevice = core::smart_refctd_ptr( + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), core::dont_grab ); + if (!cudaDevice->isValid()) + return nullptr; + return std::move(cudaDevice); } } return nullptr; @@ -1104,6 +1356,16 @@ uint32_t CCUDAHandler::getBuildCUDASDKVersion() return 0u; } +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return 0u; +} + +std::array CCUDAHandler::getLoadedNVRTCVersion() const +{ + return {-1,-1}; +} + const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const { std::abort(); @@ -1114,6 +1376,11 @@ const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const std::abort(); } +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + return {nullptr,nullptr}; +} + core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) { return nullptr; diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 8ccad3e119..54a710e48c 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -20,31 +20,31 @@ 
CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - return cuda_native::SCUexternalMemory(m_native->handle); + return cuda_interop::SNativeHandle(m_native->handle); } bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const { - if (!mappedBuffer) - return false; - - CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; - bufferDesc.offset = 0; - bufferDesc.size = m_src->getAllocationSize(); - - CUdeviceptr nativeMappedBuffer = 0; - const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) - return false; - - *mappedBuffer = cuda_native::SCUdeviceptr(nativeMappedBuffer); - return true; + if (!mappedBuffer) + return false; + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + CUdeviceptr nativeMappedBuffer = 0; + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); + if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) + return false; + + *mappedBuffer = cuda_interop::SNativeHandle(nativeMappedBuffer); + return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)); } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index d495f979ab..1afd4a10b1 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -19,7 +19,7 @@ 
CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptrhandle); + return cuda_interop::SNativeHandle(m_native->handle); } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 3a1500e77e..04384336d1 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -1,9 +1,10 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ -#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +#include "nbl/video/CUDAInteropNativeAPI.h" #include +#include namespace nbl::video { @@ -12,18 +13,33 @@ struct CCUDAHandler::SNativeState { struct SDeviceState { - cuda_native::SCUDADeviceInfo info = {}; + CUdevice handle = {}; + CUuuid uuid = {}; std::array attributes = {}; }; cuda_native::CUDA cuda; cuda_native::NVRTC nvrtc; - core::vector availableDevices; + int cudaDriverVersion = 0; + std::array nvrtcVersion = {-1,-1}; + // Snapshot discovery at handler creation so diagnostics and NVRTC compile options describe the same runtime setup. 
+ cuda_interop::SRuntimeCompileEnvironment runtimeEnvironment; + core::vector runtimeIncludeOptions; + core::vector runtimeIncludeOptionPtrs; core::vector deviceStates; - SNativeState(cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc) + SNativeState( + cuda_native::CUDA&& _cuda, + cuda_native::NVRTC&& _nvrtc, + int _cudaDriverVersion, + std::array _nvrtcVersion, + cuda_interop::SRuntimeCompileEnvironment&& _runtimeEnvironment) : cuda(std::move(_cuda)) , nvrtc(std::move(_nvrtc)) + , cudaDriverVersion(_cudaDriverVersion) + , nvrtcVersion(_nvrtcVersion) + , runtimeEnvironment(std::move(_runtimeEnvironment)) + , runtimeIncludeOptions(cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment)) {} }; From f2f62ce5985f65b81f00ed95949f4180de0678d1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 15:38:06 +0200 Subject: [PATCH 116/149] Polish CUDA interop review feedback --- examples_tests | 2 +- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/CCUDAExportableMemory.h | 2 - include/nbl/video/CCUDAHandler.h | 10 +++-- include/nbl/video/CUDAInteropHandles.h | 2 + include/nbl/video/CUDAInteropNativeAPI.h | 16 ++++++++ include/nbl/video/EApiType.h | 1 - include/nbl/video/declarations.h | 1 + src/nbl/ext/CUDAInterop/README.md | 12 ++++-- src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt | 3 -- .../ext/CUDAInterop/smoke/clean_opt_in.cpp | 39 ------------------- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 8 ++-- .../ext/CUDAInterop/smoke/public_boundary.cpp | 22 +++-------- src/nbl/video/CCUDADevice.cpp | 4 +- src/nbl/video/CCUDAHandler.cpp | 34 +++++++++------- src/nbl/video/CUDAInteropNativeState.hpp | 4 +- 16 files changed, 70 insertions(+), 92 deletions(-) delete mode 100644 src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp diff --git a/examples_tests b/examples_tests index d373d313d3..a6268bc995 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit d373d313d3e70579d650c7804af8a2785cfede9a +Subproject commit 
a6268bc9953b8d8a795b3b2eee8dbd897b05706e diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 56e81d4b2f..57a8b5262a 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -87,7 +87,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - std::array m_allocationGranularity = {}; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; bool m_valid = false; diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 510f483b3b..f1ae7f6031 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -19,8 +19,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted public: struct SCachedCreationParams { - size_t size; - uint32_t alignment; size_t granularSize; external_handle_t externalHandle; bool deviceLocal; diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index e69792b217..578d720546 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -23,6 +23,7 @@ class IPhysicalDevice; namespace cuda_native { +// SDK-free forward declarations for the dynamic CUDA/NVRTC tables exposed by the opt-in native header. 
class CUDA; class NVRTC; } @@ -30,6 +31,8 @@ class NVRTC; namespace cuda_interop { inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; +inline constexpr uint32_t RuntimeVersionComponentCount = 2u; +using SRuntimeVersion = std::array; struct SRuntimeIncludeDir { @@ -45,8 +48,9 @@ struct SRuntimeCompileEnvironment core::vector includeDirInfos; }; -NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs = {}); -NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles); inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) { core::vector options; @@ -62,7 +66,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); static uint32_t getBuildCUDASDKVersion(); uint32_t getLoadedCUDADriverVersion() const; - std::array getLoadedNVRTCVersion() const; + cuda_interop::SRuntimeVersion getLoadedNVRTCVersion() const; const cuda_native::CUDA& getCUDAFunctionTable() const; const cuda_native::NVRTC& getNVRTCFunctionTable() const; core::SRange getDefaultRuntimeIncludeOptions() const; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 987a130ad1..0b3cc9f488 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -11,6 +11,8 @@ namespace nbl::video::cuda_interop { +inline constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; + /* SDK-free CUDA handle surrogates used by Nabla's 
public video API. diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index 52dad41f09..37d8e0ec2d 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -4,6 +4,7 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ +#include #include #include "nbl/video/CUDAInterop.h" @@ -19,6 +20,11 @@ namespace nbl::video::cuda_native inline constexpr int MinimumCUDADriverVersion = 13000; inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); +static_assert(CU_MEM_LOCATION_TYPE_INVALID==0); +static_assert(CU_MEM_LOCATION_TYPE_DEVICE==1); +static_assert(CU_MEM_LOCATION_TYPE_HOST==2); +static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA==3); +static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT+1==cuda_interop::AllocationGranularityLocationTypeCount); /* Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. @@ -177,6 +183,16 @@ struct SPTXResult NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); + +// Opt-in convenience for examples/tests that intentionally assert on failures. Pass a CCUDAHandler reference. +// Nabla implementation code should prefer explicit error handling paths. 
+#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ + assert(false); \ + } while (false) + NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 9b1a79e4d4..89be885b0f 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -1,7 +1,6 @@ #ifndef __NBL_E_API_TYPE_H_INCLUDED__ #define __NBL_E_API_TYPE_H_INCLUDED__ -#include "nbl/core/declarations.h" #include #ifdef _WIN32 #ifndef WIN32_LEAN_AND_MEAN diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 4393af1768..1a74514714 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -36,6 +36,7 @@ #include "nbl/video/utilities/CSmoothResizeSurface.h" #include "nbl/video/utilities/CDefaultSwapchainFramebuffers.h" #include "nbl/video/utilities/CAssetConverter.h" +#include "nbl/video/CUDAInterop.h" //VT //#include "nbl/video/IGPUVirtualTexture.h" diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 5677db046f..dff708aff6 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -68,14 +68,18 @@ auto memory = cudaDevice->createExportableMemory({ .locationType = CU_MEM_LOCATION_TYPE_DEVICE, }); +auto& cu = handler->getCUDAFunctionTable(); +auto& nvrtc = handler->getNVRTCFunctionTable(); +int driverVersion = 0; 
+NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), *handler); + nbl::video::cuda_native::SCUdeviceptr mapped; if (importedMemory) importedMemory->getMappedBuffer(mapped); +nbl::video::cuda_native::SCUdeviceptr exported = memory->getDeviceptr(); CUdeviceptr rawMapped = mapped; -CUdeviceptr rawExported = nbl::video::cuda_native::SCUdeviceptr(memory->getDeviceptr()); -auto& cu = handler->getCUDAFunctionTable(); -auto& nvrtc = handler->getNVRTCFunctionTable(); +CUdeviceptr rawExported = exported; std::string log; auto compile = nbl::video::cuda_native::compileDirectlyToPTX( @@ -102,12 +106,12 @@ if (pcuNewCall) - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. - `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr` directly and then use it as `CUdeviceptr`. - `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for tests/examples that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. Smoke examples: - `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. -- `src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp` checks `Nabla::Nabla` package usage without SDK opt-in. 
- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, extra dynamic symbol loading, and raw interop usage. ## ABI diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt index 7118eeff09..e16d3feac0 100644 --- a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -24,9 +24,6 @@ endfunction() nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) -nbl_add_cuda_interop_smoke(NblExtCUDAInteropCleanNablaSmoke clean_opt_in.cpp) -target_link_libraries(NblExtCUDAInteropCleanNablaSmoke PRIVATE Nabla::Nabla) - if(TARGET Nabla::ext::CUDAInterop) nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) set(_nbl_cuda_interop_smoke_args PRIVATE) diff --git a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp deleted file mode 100644 index 31bf461804..0000000000 --- a/src/nbl/ext/CUDAInterop/smoke/clean_opt_in.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "nbl/video/CUDAInterop.h" -#include "nbl/system/IApplicationFramework.h" - -#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla::Nabla must not propagate the CUDA build define." -#endif - -#ifdef CUDA_VERSION -#error "Nabla::Nabla must not require CUDA SDK headers." 
-#endif - -namespace -{ - -class CUDAInteropCleanOptInSmoke final : public nbl::system::IApplicationFramework -{ - using base_t = nbl::system::IApplicationFramework; - -public: - using base_t::base_t; - - bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override - { - static_assert(std::is_class_v); - static_assert(std::is_class_v); - static_assert(std::is_class_v); - static_assert(std::is_class_v); - return isAPILoaded(); - } - - void workLoopBody() override {} - bool keepRunning() override { return false; } -}; - -} - -NBL_MAIN_FUNC(CUDAInteropCleanOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 79e85555b7..416b829fb1 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -39,8 +39,8 @@ using namespace nbl::video; if (importedFromVulkan) importedFromVulkan->getMappedBuffer(mappedVulkanMemory); - const CUdeviceptr cudaDevicePtr = cuda_native::SCUdeviceptr(cudaMemory->getDeviceptr()); - CUexternalSemaphore cudaSemaphore = nullptr; + const cuda_native::SCUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); + cuda_native::SCUexternalSemaphore cudaSemaphore; if (importedSemaphore) cudaSemaphore = cuda_native::SCUexternalSemaphore(importedSemaphore->getInternalObject()); return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; @@ -130,7 +130,9 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON - const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment({}, {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}); + const nbl::core::vector explicitIncludeDirs; + const nbl::core::vector runtimePathFiles = {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}; + const auto runtimeEnvironment = 
nbl::video::cuda_interop::findRuntimeCompileEnvironment(explicitIncludeDirs, runtimePathFiles); if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) return false; #else diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp index dc1c247806..73307599b1 100644 --- a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -1,24 +1,8 @@ #include "nabla.h" -#ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla consumers must not get the CUDA opt-in define." -#endif - -#ifdef CUDA_VERSION -#error "Nabla consumers must not include CUDA SDK headers." -#endif - #include "nbl/system/IApplicationFramework.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ -#error "Nabla consumers must not get the CUDA opt-in define." -#endif - -#ifdef CUDA_VERSION -#error "Nabla consumers must not include CUDA SDK headers." -#endif - #include "nbl/video/CUDAInterop.h" +#include #ifdef _NBL_COMPILE_WITH_CUDA_ #error "Nabla consumers must not get the CUDA opt-in define." 
@@ -40,6 +24,10 @@ class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFra bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override { + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); return isAPILoaded(); } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 0178f31fc7..29a6562640 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -167,8 +167,6 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor const auto location = static_cast(inParams.locationType); CCUDAExportableMemory::SCachedCreationParams params = { - .size = inParams.size, - .alignment = inParams.alignment, .granularSize = roundToGranularity(inParams.locationType, inParams.size), .deviceLocal = isDeviceLocal(location) }; @@ -208,7 +206,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor return nullptr; } - if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, params.alignment, location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, inParams.alignment, location, mem); CUDA_SUCCESS != err) { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 6d8b2ffb70..9305cf83c0 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -299,7 +299,7 @@ void appendSystemIncludeDirs(SRuntimeCompileEnvironment& environment) } -SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs, core::vector runtimePathFiles) +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles) { SRuntimeCompileEnvironment environment; @@ -314,8 +314,8 
@@ SRuntimeCompileEnvironment findRuntimeCompileEnvironment(core::vector explicitIncludeDirs) +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs) { - return findRuntimeCompileEnvironment(std::move(explicitIncludeDirs),{}); + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(explicitIncludeDirs,EmptyRuntimePathFiles); +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment() +{ + static const core::vector EmptyIncludeDirs; + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(EmptyIncludeDirs,EmptyRuntimePathFiles); } } @@ -378,7 +386,7 @@ std::string cudaVersionString(int version) return stream.str(); } -std::string cudaVersionString(const std::array& version) +std::string cudaVersionString(const cuda_interop::SRuntimeVersion& version) { std::ostringstream stream; stream << version[0] << "." << version[1]; @@ -401,7 +409,7 @@ std::string runtimeIncludeDirDescription(const cuda_interop::SRuntimeIncludeDir& std::string cudaRuntimeReport( const int buildVersion, const int cudaDriverVersion, const system::path& cudaDriverPath, - const std::array& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, + const cuda_interop::SRuntimeVersion& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, const cuda_interop::SRuntimeCompileEnvironment& runtimeEnvironment) { std::ostringstream stream; @@ -489,7 +497,7 @@ uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const return m_native->cudaDriverVersion; } -std::array CCUDAHandler::getLoadedNVRTCVersion() const +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const { return m_native->nvrtcVersion; } @@ -970,7 +978,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste // stop the pollution #undef SAFE_CUDA_CALL - auto readNVRTCVersion = [&](const cuda_native::NVRTC& candidate, std::array& version, const char* 
name) -> bool + auto readNVRTCVersion = [&](const cuda_native::NVRTC& candidate, cuda_interop::SRuntimeVersion& version, const char* name) -> bool { if (!candidate.pnvrtcVersion) return false; @@ -985,12 +993,12 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste }; cuda_native::NVRTC nvrtc = {}; - std::array nvrtcVersion = {-1,-1}; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; std::string nvrtcLibraryName; #if defined(_NBL_WINDOWS_API_) cuda_native::NVRTC fallbackNVRTC = {}; - std::array fallbackNVRTCVersion = {-1,-1}; + cuda_interop::SRuntimeVersion fallbackNVRTCVersion = {-1,-1}; std::string fallbackNVRTCLibraryName; /* @@ -1016,7 +1024,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste candidateName += *suffix; cuda_native::NVRTC candidate(candidateName.c_str()); - std::array candidateVersion = {-1,-1}; + cuda_interop::SRuntimeVersion candidateVersion = {-1,-1}; if (!readNVRTCVersion(candidate,candidateVersion,candidateName.c_str())) continue; @@ -1361,7 +1369,7 @@ uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const return 0u; } -std::array CCUDAHandler::getLoadedNVRTCVersion() const +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const { return {-1,-1}; } diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp index 04384336d1..04a70c6e4e 100644 --- a/src/nbl/video/CUDAInteropNativeState.hpp +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -21,7 +21,7 @@ struct CCUDAHandler::SNativeState cuda_native::CUDA cuda; cuda_native::NVRTC nvrtc; int cudaDriverVersion = 0; - std::array nvrtcVersion = {-1,-1}; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; // Snapshot discovery at handler creation so diagnostics and NVRTC compile options describe the same runtime setup. 
cuda_interop::SRuntimeCompileEnvironment runtimeEnvironment; core::vector runtimeIncludeOptions; @@ -32,7 +32,7 @@ struct CCUDAHandler::SNativeState cuda_native::CUDA&& _cuda, cuda_native::NVRTC&& _nvrtc, int _cudaDriverVersion, - std::array _nvrtcVersion, + cuda_interop::SRuntimeVersion _nvrtcVersion, cuda_interop::SRuntimeCompileEnvironment&& _runtimeEnvironment) : cuda(std::move(_cuda)) , nvrtc(std::move(_nvrtc)) From 9c504a14a63527f01cdf324672d7ac8c47e86749 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 17:11:06 +0200 Subject: [PATCH 117/149] Polish CUDA interop native header --- include/nbl/video/CCUDADevice.h | 3 ++- include/nbl/video/CUDAInteropHandles.h | 2 -- include/nbl/video/CUDAInteropNativeAPI.h | 22 +++++++--------------- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 57a8b5262a..d6a1378dcb 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -80,6 +80,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted private: friend class CCUDAHandler; + static constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; struct SNativeState; CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); bool isValid() const; @@ -87,7 +88,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - std::array m_allocationGranularity = {}; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; bool m_valid = false; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 0b3cc9f488..987a130ad1 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ 
b/include/nbl/video/CUDAInteropHandles.h @@ -11,8 +11,6 @@ namespace nbl::video::cuda_interop { -inline constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; - /* SDK-free CUDA handle surrogates used by Nabla's public video API. diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index 37d8e0ec2d..eb75f0eec3 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -20,11 +20,6 @@ namespace nbl::video::cuda_native inline constexpr int MinimumCUDADriverVersion = 13000; inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); -static_assert(CU_MEM_LOCATION_TYPE_INVALID==0); -static_assert(CU_MEM_LOCATION_TYPE_DEVICE==1); -static_assert(CU_MEM_LOCATION_TYPE_HOST==2); -static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA==3); -static_assert(CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT+1==cuda_interop::AllocationGranularityLocationTypeCount); /* Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. @@ -183,16 +178,6 @@ struct SPTXResult NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); - -// Opt-in convenience for examples/tests that intentionally assert on failures. Pass a CCUDAHandler reference. -// Nabla implementation code should prefer explicit error handling paths. 
-#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ - do { \ - const auto nblCudaInteropResult = (expr); \ - if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ - assert(false); \ - } while (false) - NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); @@ -202,6 +187,13 @@ NBL_API2 SPTXResult compileDirectlyToPTX( std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); +#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ + assert(false); \ + } while (false) + } #endif From 0df750788774a7c8da94a9b4d14a649d3f7b4761 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 19:42:13 +0200 Subject: [PATCH 118/149] Use opaque CUDA interop handles --- examples_tests | 2 +- .../nbl/ext/CUDAInterop/CUDAInteropNative.h | 25 +--- include/nbl/video/CCUDAHandler.h | 32 +++++ include/nbl/video/CCUDAImportedMemory.h | 17 +++ include/nbl/video/CUDAInteropHandles.h | 113 +++++++++++++----- include/nbl/video/CUDAInteropNativeAPI.h | 57 ++++----- src/nbl/ext/CUDAInterop/README.md | 23 ++-- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 14 +-- src/nbl/video/CCUDADevice.cpp | 28 ++--- src/nbl/video/CCUDAExportableMemory.cpp | 6 +- src/nbl/video/CCUDAHandler.cpp | 92 +++++++------- src/nbl/video/CCUDAImportedMemory.cpp | 8 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 4 +- 13 files changed, 249 insertions(+), 172 deletions(-) diff --git 
a/examples_tests b/examples_tests index a6268bc995..eb8f44a1b5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit a6268bc9953b8d8a795b3b2eee8dbd897b05706e +Subproject commit eb8f44a1b5ef38d1416a6fdc9a43e8e0215ec0bf diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h index 538645ce3d..ea360d785a 100644 --- a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -40,30 +40,17 @@ namespace nbl::video::cuda_native { /* - CUDA SDK view of an SDK-free opaque handle. + This header specializes the SDK-free opaque handles from nbl/video/CUDAInteropHandles.h for the CUDA SDK + visible to this translation unit. After that opt-in, Nabla interop methods can be called with native CUDA/NVRTC + types such as CUdeviceptr, CUexternalSemaphore, nvrtcProgram, CUresult, and nvrtcResult. - The conversions are intentionally available only after including this header. Public Nabla headers expose - only the opaque SCU* values. Once a consumer opts in, the aliases below restore the CUDA spelling and - ergonomics for raw Driver API calls without adding accessors to every interop operation. Each alias maps one - Nabla opaque handle to the matching CUDA SDK handle and validates size/alignment against the SDK selected by - this opt-in translation unit. -*/ -using SCUdevice = cuda_interop::SNativeHandle; -using SCUcontext = cuda_interop::SNativeHandle; -using SCUdeviceptr = cuda_interop::SNativeHandle; -using SCUexternalMemory = cuda_interop::SNativeHandle; -using SCUexternalSemaphore = cuda_interop::SNativeHandle; - -/* - Check whether this opt-in translation unit uses the exact CUDA SDK version that was used to build Nabla's - CUDA interop implementation. Opaque handle layout is checked by SNativeHandle aliases above. This exact - version check is a policy helper for SDK-typed code that wants to warn about or reject compatible-but-different - SDK headers. 
+ The size/alignment checks live in nbl/video/CUDAInteropNativeAPI.h. This exact version check is a policy helper + for SDK-typed code that wants to warn about or reject compatible-but-different SDK headers. */ inline bool isBuildCUDASDKVersionExactMatch() { const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); - return buildVersion==0u || buildVersion==CUDA_VERSION; + return buildVersion==CUDA_VERSION; } } diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 578d720546..241f59ea5b 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -7,13 +7,16 @@ #include "nbl/core/declarations.h" #include "nbl/core/definitions.h" +#include "nbl/asset/ICPUBuffer.h" #include "nbl/system/declarations.h" #include "nbl/system/path.h" +#include "nbl/video/CUDAInteropHandles.h" #include #include #include #include +#include namespace nbl::video { @@ -72,6 +75,35 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::SRange getDefaultRuntimeIncludeOptions() const; inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } + struct SPTXResult + { + core::smart_refctd_ptr ptx; + cuda_interop::SNVRTCResult result; + }; + + static bool defaultHandleResult(cuda_interop::SCUresult result, const system::logger_opt_ptr& logger); + bool defaultHandleResult(cuda_interop::SCUresult result) const; + bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; + + cuda_interop::SNVRTCResult createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + template + requires (!std::is_same_v,cuda_interop::SNVRTCProgram>) + cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + { + 
cuda_interop::SNVRTCProgram opaqueProgram = {}; + const auto result = createProgram(&opaqueProgram,std::move(source),name,headerCount,headerContents,includeNames); + if (prog) + *prog = static_cast(opaqueProgram); + return result; + } + cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const; + cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; + SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; + SPTXResult compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr + ); + inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? nullptr:(&m_headers[0].get()); diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index e2c9bb6db6..5cdb1bb3f6 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -5,6 +5,7 @@ #include "nbl/video/CUDAInteropHandles.h" #include +#include #include namespace nbl::video @@ -19,6 +20,22 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted cuda_interop::SCUexternalMemory getInternalObject() const; bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; bool getMappedBuffer(cuda_interop::SCUdeviceptr& mappedBuffer) const { return getMappedBuffer(&mappedBuffer); } + template + requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) + bool getMappedBuffer(DevicePtr* mappedBuffer) const + { + cuda_interop::SCUdeviceptr opaqueMappedBuffer = {}; + const auto result = getMappedBuffer(&opaqueMappedBuffer); + if (result && mappedBuffer) + *mappedBuffer = static_cast(opaqueMappedBuffer); + return result; + } + template + requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) + bool getMappedBuffer(DevicePtr& mappedBuffer) const + { + return 
getMappedBuffer(&mappedBuffer); + } private: friend class CCUDADevice; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 987a130ad1..b9e5be244b 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -4,8 +4,10 @@ #ifndef _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ #define _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ +#include #include #include +#include #include namespace nbl::video::cuda_interop @@ -19,17 +21,8 @@ namespace nbl::video::cuda_interop not inherit CUDA SDK as a public compile-time dependency. CUDAInteropNative.h maps these opaque handles back to the real CU* types and checks their size/alignment against the SDK selected by the opt-in consumer. */ -template -struct alignas(alignof(Storage)) SOpaqueCUDAHandle -{ - uint8_t value[sizeof(Storage)] = {}; -}; - -struct SCUdevice : SOpaqueCUDAHandle {}; -struct SCUcontext : SOpaqueCUDAHandle {}; -struct SCUdeviceptr : SOpaqueCUDAHandle {}; -struct SCUexternalMemory : SOpaqueCUDAHandle {}; -struct SCUexternalSemaphore : SOpaqueCUDAHandle {}; +template +struct SOpaqueCUDANativeType; template concept cuda_opaque_handle = @@ -38,36 +31,90 @@ concept cuda_opaque_handle = sizeof(Opaque)==sizeof(Native) && alignof(Opaque)==alignof(Native); -/* - Native view of an SDK-free opaque handle. - - This template does not depend on CUDA SDK types by itself. CUDAInteropNative.h binds it to concrete CU* types - after the consumer opts into CUDA SDK headers. The layout check keeps the public opaque handle and the native - SDK handle compatible in that translation unit while preserving Nabla's SDK-free public headers. 
-*/ template -struct SNativeHandle +concept cuda_native_handle_for = + requires { typename SOpaqueCUDANativeType::type; } && + std::same_as,typename SOpaqueCUDANativeType::type> && + cuda_opaque_handle>; + +template +struct alignas(alignof(Storage)) SOpaqueCUDAHandle { - using cuda_t = Native; - static_assert(cuda_opaque_handle); + uint8_t value[sizeof(Storage)] = {}; + + SOpaqueCUDAHandle() = default; + + template + requires cuda_native_handle_for + SOpaqueCUDAHandle(const Native& native) + { + operator=(native); + } + + template + requires cuda_native_handle_for + Derived& operator=(const Native& native) + { + std::memcpy(value,&native,sizeof(native)); + return static_cast(*this); + } - SNativeHandle() = default; - SNativeHandle(const SNativeHandle&) = default; - SNativeHandle(const cuda_t& native) { operator=(native); } - SNativeHandle(const Opaque& opaque) { operator=(opaque); } + template + requires cuda_native_handle_for + operator Native() const + { + Native native = {}; + std::memcpy(&native,value,sizeof(native)); + return native; + } - SNativeHandle& operator=(const SNativeHandle&) = default; - SNativeHandle& operator=(const cuda_t& native) { value = native; return *this; } - SNativeHandle& operator=(const Opaque& opaque) { operator Opaque&() = opaque; return *this; } + template + requires cuda_native_handle_for + friend bool operator==(const Derived& lhs, const Native& rhs) + { + return static_cast(lhs)==rhs; + } - operator cuda_t&() { return value; } - operator const cuda_t&() const { return value; } - operator Opaque&() { return reinterpret_cast(value); } - operator const Opaque&() const { return reinterpret_cast(value); } + template + requires cuda_native_handle_for + friend bool operator==(const Native& lhs, const Derived& rhs) + { + return lhs==static_cast(rhs); + } - cuda_t value = {}; + template + requires cuda_native_handle_for + friend bool operator!=(const Derived& lhs, const Native& rhs) + { + return !(lhs==rhs); + } + + template + requires 
cuda_native_handle_for + friend bool operator!=(const Native& lhs, const Derived& rhs) + { + return !(lhs==rhs); + } }; +#define NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(NAME, STORAGE) \ + struct NAME : SOpaqueCUDAHandle \ + { \ + using SOpaqueCUDAHandle::SOpaqueCUDAHandle; \ + using SOpaqueCUDAHandle::operator=; \ + } + +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdevice, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUcontext, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdeviceptr, uintptr_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalMemory, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalSemaphore, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUresult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCResult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCProgram, void*); + +#undef NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE + } #endif diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index eb75f0eec3..d61ce32b67 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -8,12 +8,34 @@ #include #include "nbl/video/CUDAInterop.h" -#include "nbl/asset/ICPUBuffer.h" #include "nbl/system/DynamicFunctionCaller.h" #include "cuda.h" #include "nvrtc.h" +namespace nbl::video::cuda_interop +{ + +template<> struct SOpaqueCUDANativeType { using type = CUdevice; }; +template<> struct SOpaqueCUDANativeType { using type = CUcontext; }; +template<> struct SOpaqueCUDANativeType { using type = CUdeviceptr; }; +template<> struct SOpaqueCUDANativeType { using type = CUexternalMemory; }; +template<> struct SOpaqueCUDANativeType { using type = CUexternalSemaphore; }; +template<> struct SOpaqueCUDANativeType { using type = CUresult; }; +template<> struct SOpaqueCUDANativeType { using type = nvrtcResult; }; +template<> struct SOpaqueCUDANativeType { using type = nvrtcProgram; }; + +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); 
+static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); +static_assert(cuda_opaque_handle); + +} + namespace nbl::video::cuda_native { @@ -30,9 +52,8 @@ static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or h do not include it, so normal Nabla consumers do not need cuda.h or nvrtc.h. The declarations below intentionally use CUDA/NVRTC SDK types because they describe the SDK-typed glue between - raw CUDA code and Nabla's exported CUDA interop objects: dynamic function tables, NVRTC helpers, error handling, - and runtime header discovery integration. Consumers enter this surface only by linking Nabla::ext::CUDAInterop - and including nbl/ext/CUDAInterop/CUDAInteropNative.h. + raw CUDA code and Nabla's exported CUDA interop objects. Consumers enter this surface only by linking + Nabla::ext::CUDAInterop and including nbl/ext/CUDAInterop/CUDAInteropNative.h. */ using LibLoader = system::DefaultFuncPtrLoader; @@ -161,36 +182,10 @@ NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, nvrtcGetProgramLogSize ); -struct SPTXResult -{ - core::smart_refctd_ptr ptx; - nvrtcResult result; -}; - -/* - Exported Nabla glue declarations with CUDA SDK signatures. - - These are not a CUDA wrapper. They are the small boundary surface used for error handling, NVRTC helpers, - runtime header discovery integration, and dynamic CUDA/NVRTC table access. Nabla owns the definitions. - The signatures mention CUDA SDK types, so they are intentionally unavailable to consumers that only parse - SDK-free nbl/video/CCUDA*.h headers. 
-*/ -NBL_API2 bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, CUresult result); -NBL_API2 bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result); -NBL_API2 nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); -NBL_API2 nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options); -NBL_API2 nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log); -NBL_API2 SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog); -NBL_API2 SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr -); - #define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ do { \ const auto nblCudaInteropResult = (expr); \ - if (!::nbl::video::cuda_native::defaultHandleResult((handler),nblCudaInteropResult)) \ + if (!(handler).defaultHandleResult(nblCudaInteropResult)) \ assert(false); \ } while (false) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index dff708aff6..55db5cbd24 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -6,8 +6,8 @@ - The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. - CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. - `Nabla::ext::CUDAInterop` is an `INTERFACE` target. 
It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. -- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and exported Nabla glue functions whose signatures use CUDA SDK types. -- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. It includes the native API header and aliases Nabla opaque handles to CUDA SDK types through `cuda_interop::SNativeHandle`. +- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and binds SDK-free opaque handles to CUDA/NVRTC SDK types. +- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. It includes the native API header so SDK-typed code can use CUDA/NVRTC handles directly with Nabla interop methods. ## CMake Usage @@ -73,17 +73,14 @@ auto& nvrtc = handler->getNVRTCFunctionTable(); int driverVersion = 0; NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), *handler); -nbl::video::cuda_native::SCUdeviceptr mapped; +CUdeviceptr mapped = 0; if (importedMemory) importedMemory->getMappedBuffer(mapped); -nbl::video::cuda_native::SCUdeviceptr exported = memory->getDeviceptr(); -CUdeviceptr rawMapped = mapped; -CUdeviceptr rawExported = exported; +CUdeviceptr exported = memory->getDeviceptr(); std::string log; -auto compile = nbl::video::cuda_native::compileDirectlyToPTX( - *handler, +auto compile = handler->compileDirectlyToPTX( std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), @@ -102,11 +99,11 @@ if (pcuNewCall) pcuNewCall(...); ``` -- `cuda_interop::SNativeHandle` converts between SDK-free Nabla opaque handles and CUDA SDK handles such as `CUdeviceptr`. The template itself is SDK-free. 
`CUDAInteropNative.h` only provides CUDA-typed aliases. +- `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr`. SDK opt-in code can pass `cuda_native::SCUdeviceptr` directly and then use it as `CUdeviceptr`. -- `compileDirectlyToPTX` returns PTX/result and writes the NVRTC log to a required `std::string&`. -- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for tests/examples that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. +- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr` in SDK-free code. SDK opt-in code can pass `CUdeviceptr` directly. +- `CCUDAHandler::createProgram`, `compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can call them with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. 
It is a policy helper, not an automatic runtime rejection rule. Smoke examples: @@ -138,7 +135,7 @@ NVRTC may need CUDA runtime headers when user kernels include files such as `cud - The first include root is not required to match the SDK used to build Nabla. It is the first `-I` path visible to NVRTC, so the first path containing a requested header wins just like normal C/C++ include search. - If the primary runtime header root is incomplete or reports a different CUDA version than the loaded NVRTC runtime, Nabla logs a warning. This is diagnostic policy, not an automatic hard failure. - The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. -- `CCUDAHandler` captures discovered include directories when it is created. `cuda_native::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. +- `CCUDAHandler` captures discovered include directories when it is created. `CCUDAHandler::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 416b829fb1..c2f9a97ac4 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -35,14 +35,14 @@ using namespace nbl::video; auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); - cuda_native::SCUdeviceptr mappedVulkanMemory; + CUdeviceptr mappedVulkanMemory = 0; if (importedFromVulkan) importedFromVulkan->getMappedBuffer(mappedVulkanMemory); - const cuda_native::SCUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); - cuda_native::SCUexternalSemaphore cudaSemaphore; + const CUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); + CUexternalSemaphore cudaSemaphore = nullptr; if (importedSemaphore) - cudaSemaphore = cuda_native::SCUexternalSemaphore(importedSemaphore->getInternalObject()); + cudaSemaphore = importedSemaphore->getInternalObject(); return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; } @@ -100,8 +100,7 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) )cuda"; std::string log; - auto compile = cuda_native::compileDirectlyToPTX( - handler, + auto compile = handler.compileDirectlyToPTX( std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, @@ -126,7 +125,8 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!isAPILoaded()) return false; - static_assert(std::is_same_v); + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 29a6562640..25caa0162b 100644 --- 
a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -88,9 +88,9 @@ CCUDADevice::CCUDADevice( const auto& cu = m_handler->getCUDAFunctionTable(); - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + if (!m_handler->defaultHandleResult(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) return; - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuCtxSetCurrent(m_native->context))) + if (!m_handler->defaultHandleResult(cu.pcuCtxSetCurrent(m_native->context))) return; for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) @@ -109,7 +109,7 @@ CCUDADevice::CCUDADevice( .win32HandleMetaData = &metadata, #endif }; - if (!cuda_native::defaultHandleResult(*m_handler, cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + if (!m_handler->defaultHandleResult(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) return; } m_valid = true; @@ -117,12 +117,12 @@ CCUDADevice::CCUDADevice( cuda_interop::SCUdevice CCUDADevice::getInternalObject() const { - return cuda_interop::SNativeHandle(m_native->handle); + return m_native->handle; } cuda_interop::SCUcontext CCUDADevice::getContext() const { - return cuda_interop::SNativeHandle(m_native->context); + return m_native->context; } static bool isDeviceLocal(CUmemLocationType location) @@ -140,7 +140,7 @@ static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); return err; } @@ -151,8 +151,8 @@ static CUresult reserveAddressAndMapMemory(const CCUDAHandler& handler, CUdevice if (auto err = cu.pcuMemSetAccess(ptr, 
size, &accessDesc, 1); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(handler, cu.pcuMemUnmap(ptr, size)); - cuda_native::defaultHandleResult(handler, cu.pcuMemAddressFree(ptr, size)); + handler.defaultHandleResult(cu.pcuMemUnmap(ptr, size)); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); return err; } @@ -202,7 +202,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); - cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); return nullptr; } @@ -210,7 +210,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor { m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - cuda_native::defaultHandleResult(*handler, cu.pcuMemRelease(mem)); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); if (!CloseExternalHandle(params.externalHandle)) m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); @@ -220,9 +220,9 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - cuda_native::defaultHandleResult(*handler, err); - cuda_native::defaultHandleResult(*handler, cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); - cuda_native::defaultHandleResult(*handler, cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); + handler->defaultHandleResult(err); + handler->defaultHandleResult(cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); + handler->defaultHandleResult(cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); if (!CloseExternalHandle(params.externalHandle)) m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; @@ -297,7 +297,7 @@ core::smart_refctd_ptr 
CCUDADevice::importExternalSemaph CCUDADevice::~CCUDADevice() { if (m_native->context) - cuda_native::defaultHandleResult(*m_handler, m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); + m_handler->defaultHandleResult(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); } bool CCUDADevice::isValid() const diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 722c958b68..9333a39f54 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -54,9 +54,9 @@ CCUDAExportableMemory::~CCUDAExportableMemory() { const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); + m_device->getHandler()->defaultHandleResult(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); + m_device->getHandler()->defaultHandleResult(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); if (!CloseExternalHandle(m_params.externalHandle)) m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); @@ -65,7 +65,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const { - return cuda_interop::SNativeHandle(m_native->ptr); + return m_native->ptr; } } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9305cf83c0..094046ea6c 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -520,11 +520,9 @@ core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() return {begin,begin+m_native->runtimeIncludeOptionPtrs.size()}; } -namespace cuda_native -{ - -bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +bool 
CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult, const system::logger_opt_ptr& logger) { + const CUresult result = opaqueResult; switch (result) { case CUDA_SUCCESS: @@ -893,26 +891,28 @@ bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) return false; } -bool defaultHandleResult(const CCUDAHandler& handler, CUresult result) +bool CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult) const { + const CUresult result = opaqueResult; if (result==CUDA_ERROR_UNSUPPORTED_PTX_VERSION) { - const auto cudaVersion = handler.getLoadedCUDADriverVersion(); - const auto nvrtcVersion = handler.getLoadedNVRTCVersion(); - handler.getLogger().log( + const auto cudaVersion = getLoadedCUDADriverVersion(); + const auto nvrtcVersion = getLoadedNVRTCVersion(); + getLogger().log( "CCUDAHandler: CUDA driver API %d.%d rejected PTX produced through NVRTC %d.%d. Install a newer NVIDIA driver or use an NVRTC/runtime-header set compatible with the installed driver.", system::ILogger::ELL_ERROR, cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion), nvrtcVersion[0],nvrtcVersion[1] ); } - return defaultHandleResult(result,handler.getLogger()); + return defaultHandleResult(opaqueResult,getLogger()); } -bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) +bool CCUDAHandler::defaultHandleResult(cuda_interop::SNVRTCResult opaqueResult) const { - const auto& nvrtc = handler.getNVRTCFunctionTable(); - const auto logger = handler.getLogger(); + const nvrtcResult result = opaqueResult; + const auto& nvrtc = getNVRTCFunctionTable(); + const auto logger = getLogger(); switch (result) { case NVRTC_SUCCESS: @@ -928,8 +928,6 @@ bool defaultHandleResult(const CCUDAHandler& handler, nvrtcResult result) return false; } -} - core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { const system::logger_opt_ptr logger(_logger.get()); @@ -1139,10 +1137,7 @@ 
core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste ); } -namespace cuda_native -{ - -nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -1151,33 +1146,40 @@ nvrtcResult createProgram(CCUDAHandler& handler, nvrtcProgram* prog, std::string #else #error "Unsuported Platform" #endif - return handler.getNVRTCFunctionTable().pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + nvrtcProgram nativeProgram = nullptr; + const auto result = getNVRTCFunctionTable().pnvrtcCreateProgram(&nativeProgram,source.c_str(),name,headerCount,headerContents,includeNames); + if (prog) + *prog = nativeProgram; + return result; } -nvrtcResult compileProgram(const CCUDAHandler& handler, nvrtcProgram prog, core::SRange options) +cuda_interop::SNVRTCResult CCUDAHandler::compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const { - return handler.getNVRTCFunctionTable().pnvrtcCompileProgram(prog,options.size(),options.begin()); + const nvrtcProgram nativeProgram = prog; + return getNVRTCFunctionTable().pnvrtcCompileProgram(nativeProgram,options.size(),options.begin()); } -nvrtcResult getProgramLog(const CCUDAHandler& handler, nvrtcProgram prog, std::string& log) +cuda_interop::SNVRTCResult CCUDAHandler::getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const { size_t _size = 0ull; - const auto& nvrtc = handler.getNVRTCFunctionTable(); - nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + const nvrtcProgram nativeProgram = prog; + 
const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(nativeProgram, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return nvrtc.pnvrtcGetProgramLog(nativeProgram,log.data()); } -SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) +CCUDAHandler::SPTXResult CCUDAHandler::getPTX(cuda_interop::SNVRTCProgram prog) const { size_t _size = 0ull; - const auto& nvrtc = handler.getNVRTCFunctionTable(); - nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(prog,&_size); + const nvrtcProgram nativeProgram = prog; + const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(nativeProgram,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) @@ -1187,13 +1189,14 @@ SPTXResult getPTX(const CCUDAHandler& handler, nvrtcProgram prog) ptxParams.size = _size; auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); - return {std::move(ptx),nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; + return {std::move(ptx),nvrtc.pnvrtcGetPTX(nativeProgram,ptxPtr)}; } -static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string& log) +static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string& log) { log.clear(); - if (result!=NVRTC_SUCCESS) + const nvrtcResult nativeResult = result; + if (nativeResult!=NVRTC_SUCCESS) return {nullptr,result}; const auto runtimeIncludeOptions = handler.getDefaultRuntimeIncludeOptions(); @@ -1206,30 +1209,29 @@ static SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, nvrtcResult r const auto* optionsBegin = options.empty() ? 
nullptr:options.data(); const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); - result = compileProgram(handler,program,{optionsBegin,optionsEnd}); - getProgramLog(handler,program,log); - if (result!=NVRTC_SUCCESS) + result = handler.compileProgram(program,{optionsBegin,optionsEnd}); + handler.getProgramLog(program,log); + if (static_cast(result)!=NVRTC_SUCCESS) return {nullptr,result}; - return getPTX(handler,program); + return handler.getPTX(program); } -SPTXResult compileDirectlyToPTX( - CCUDAHandler& handler, std::string&& source, const char* filename, core::SRange nvrtcOptions, +CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + cuda_interop::SNVRTCProgram program = {}; + cuda_interop::SNVRTCResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; auto cleanup = core::makeRAIIExiter([&]() -> void { - if (program) - handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); + nvrtcProgram nativeProgram = program; + if (nativeProgram) + getNVRTCFunctionTable().pnvrtcDestroyProgram(&nativeProgram); }); - result = createProgram(handler,&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(handler,result,program,nvrtcOptions,log); -} - + result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(*this,result,program,nvrtcOptions,log); } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 54a710e48c..3743790a58 100644 --- 
a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -20,7 +20,7 @@ CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr dev cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const { - return cuda_interop::SNativeHandle(m_native->handle); + return m_native->handle; } bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const @@ -35,17 +35,17 @@ bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuff CUdeviceptr nativeMappedBuffer = 0; const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); - if (!cuda_native::defaultHandleResult(*m_device->getHandler(),result)) + if (!m_device->getHandler()->defaultHandleResult(result)) return false; - *mappedBuffer = cuda_interop::SNativeHandle(nativeMappedBuffer); + *mappedBuffer = nativeMappedBuffer; return true; } CCUDAImportedMemory::~CCUDAImportedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalMemory(m_native->handle)); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalMemory(m_native->handle)); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 1afd4a10b1..49495e11e2 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -19,13 +19,13 @@ CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr(m_native->handle); + return m_native->handle; } CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cuda_native::defaultHandleResult(*m_device->getHandler(), cu.pcuDestroyExternalSemaphore(m_native->handle)); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalSemaphore(m_native->handle)); 
} } From 21d3b7ce6761ddd4744f310a3412d03a731f3e2e Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 19:57:28 +0200 Subject: [PATCH 119/149] Accept CUDA handler pointers in assert helper --- examples_tests | 2 +- include/nbl/video/CUDAInteropNativeAPI.h | 2 +- src/nbl/ext/CUDAInterop/README.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples_tests b/examples_tests index eb8f44a1b5..39441760d3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit eb8f44a1b5ef38d1416a6fdc9a43e8e0215ec0bf +Subproject commit 39441760d335467158a340ad366302235ba6c30e diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index d61ce32b67..e3cf7c6f78 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -185,7 +185,7 @@ NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, #define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ do { \ const auto nblCudaInteropResult = (expr); \ - if (!(handler).defaultHandleResult(nblCudaInteropResult)) \ + if (!(handler)->defaultHandleResult(nblCudaInteropResult)) \ assert(false); \ } while (false) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 55db5cbd24..f5049a775a 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -71,7 +71,7 @@ auto memory = cudaDevice->createExportableMemory({ auto& cu = handler->getCUDAFunctionTable(); auto& nvrtc = handler->getNVRTCFunctionTable(); int driverVersion = 0; -NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), *handler); +NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), handler); CUdeviceptr mapped = 0; if (importedMemory) @@ -103,7 +103,7 @@ if (pcuNewCall) - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. 
Nabla stores them as integer values in its public ABI. - `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr` in SDK-free code. SDK opt-in code can pass `CUdeviceptr` directly. - `CCUDAHandler::createProgram`, `compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can call them with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. -- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handlerRef)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a `CCUDAHandler&`. Nabla implementation code should still prefer explicit error handling and clean returns. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. 
Smoke examples: From dfca17ee1dc891ecb71451bd98c44f7a8dd9b8e8 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 20:04:14 +0200 Subject: [PATCH 120/149] Consolidate CUDA native handle declarations --- include/nbl/video/CUDAInteropNativeAPI.h | 31 +++++++++++------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h index e3cf7c6f78..6084d4a00c 100644 --- a/include/nbl/video/CUDAInteropNativeAPI.h +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -16,23 +16,20 @@ namespace nbl::video::cuda_interop { -template<> struct SOpaqueCUDANativeType { using type = CUdevice; }; -template<> struct SOpaqueCUDANativeType { using type = CUcontext; }; -template<> struct SOpaqueCUDANativeType { using type = CUdeviceptr; }; -template<> struct SOpaqueCUDANativeType { using type = CUexternalMemory; }; -template<> struct SOpaqueCUDANativeType { using type = CUexternalSemaphore; }; -template<> struct SOpaqueCUDANativeType { using type = CUresult; }; -template<> struct SOpaqueCUDANativeType { using type = nvrtcResult; }; -template<> struct SOpaqueCUDANativeType { using type = nvrtcProgram; }; - -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); -static_assert(cuda_opaque_handle); +#define NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(OPAQUE, NATIVE) \ + template<> struct SOpaqueCUDANativeType { using type = NATIVE; }; \ + static_assert(cuda_opaque_handle) + +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdevice, CUdevice); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUcontext, CUcontext); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdeviceptr, CUdeviceptr); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalMemory, CUexternalMemory); 
+NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalSemaphore, CUexternalSemaphore); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUresult, CUresult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCResult, nvrtcResult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCProgram, nvrtcProgram); + +#undef NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE } From 525315eeb9d3e614d565e0fba3b32382336989ac Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Sun, 10 May 2026 23:44:30 +0200 Subject: [PATCH 121/149] Tighten CUDA native output bridges --- include/nbl/video/CCUDAHandler.h | 13 ++---- include/nbl/video/CCUDAImportedMemory.h | 20 ++------- include/nbl/video/CUDAInteropHandles.h | 60 +++++++++++++++---------- src/nbl/ext/CUDAInterop/README.md | 4 +- 4 files changed, 46 insertions(+), 51 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 241f59ea5b..9975a7e212 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include namespace nbl::video { @@ -86,15 +86,10 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; cuda_interop::SNVRTCResult createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - template - requires (!std::is_same_v,cuda_interop::SNVRTCProgram>) - cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + NBL_CUDA_INTEROP_NATIVE_FOR(Program, cuda_interop::SNVRTCProgram) + inline cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* 
includeNames=nullptr) { - cuda_interop::SNVRTCProgram opaqueProgram = {}; - const auto result = createProgram(&opaqueProgram,std::move(source),name,headerCount,headerContents,includeNames); - if (prog) - *prog = static_cast(opaqueProgram); - return result; + return createProgram(cuda_interop::asOpaqueOutput(prog),std::move(source),name,headerCount,headerContents,includeNames); } cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const; cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 5cdb1bb3f6..720ae30b3d 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -5,8 +5,6 @@ #include "nbl/video/CUDAInteropHandles.h" #include -#include -#include namespace nbl::video { @@ -19,22 +17,10 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted ~CCUDAImportedMemory() override; cuda_interop::SCUexternalMemory getInternalObject() const; bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; - bool getMappedBuffer(cuda_interop::SCUdeviceptr& mappedBuffer) const { return getMappedBuffer(&mappedBuffer); } - template - requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) - bool getMappedBuffer(DevicePtr* mappedBuffer) const + NBL_CUDA_INTEROP_NATIVE_FOR(DevicePtr, cuda_interop::SCUdeviceptr) + inline bool getMappedBuffer(DevicePtr& mappedBuffer) const { - cuda_interop::SCUdeviceptr opaqueMappedBuffer = {}; - const auto result = getMappedBuffer(&opaqueMappedBuffer); - if (result && mappedBuffer) - *mappedBuffer = static_cast(opaqueMappedBuffer); - return result; - } - template - requires (!std::is_same_v,cuda_interop::SCUdeviceptr>) - bool getMappedBuffer(DevicePtr& mappedBuffer) const - { - return getMappedBuffer(&mappedBuffer); + return getMappedBuffer(cuda_interop::asOpaqueOutput(mappedBuffer)); } 
private: diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index b9e5be244b..3b555b599f 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -7,7 +7,6 @@ #include #include #include -#include #include namespace nbl::video::cuda_interop @@ -34,8 +33,32 @@ concept cuda_opaque_handle = template concept cuda_native_handle_for = requires { typename SOpaqueCUDANativeType::type; } && - std::same_as,typename SOpaqueCUDANativeType::type> && - cuda_opaque_handle>; + std::same_as,typename SOpaqueCUDANativeType::type> && + cuda_opaque_handle>; + +template +requires cuda_native_handle_for +Opaque* asOpaqueOutput(Native* native) +{ + return reinterpret_cast(native); +} + +template +requires cuda_native_handle_for +Opaque* asOpaqueOutput(Native& native) +{ + return asOpaqueOutput(&native); +} + +/* + Declare a narrow native-reference bridge for SDK opt-in code. Value conversions make SCU* handles usable as + native CUDA handles after CUDAInteropNative.h is included, but output parameters still need a writable object + whose storage matches the opaque handle. Use asOpaqueOutput inside such bridge overloads. This macro keeps + them short and constrained to the exact SDK type validated for the opaque handle. 
+*/ +#define NBL_CUDA_INTEROP_NATIVE_FOR(TYPE, OPAQUE) \ + template \ + requires ::nbl::video::cuda_interop::cuda_native_handle_for template struct alignas(alignof(Storage)) SOpaqueCUDAHandle @@ -53,47 +76,38 @@ struct alignas(alignof(Storage)) SOpaqueCUDAHandle template requires cuda_native_handle_for - Derived& operator=(const Native& native) - { - std::memcpy(value,&native,sizeof(native)); - return static_cast(*this); - } - - template - requires cuda_native_handle_for - operator Native() const + operator Native&() { - Native native = {}; - std::memcpy(&native,value,sizeof(native)); - return native; + return *reinterpret_cast(value); } template requires cuda_native_handle_for - friend bool operator==(const Derived& lhs, const Native& rhs) + operator const Native&() const { - return static_cast(lhs)==rhs; + return *reinterpret_cast(value); } template requires cuda_native_handle_for - friend bool operator==(const Native& lhs, const Derived& rhs) + Derived& operator=(const Native& native) { - return lhs==static_cast(rhs); + static_cast(*this) = native; + return static_cast(*this); } template requires cuda_native_handle_for - friend bool operator!=(const Derived& lhs, const Native& rhs) + friend bool operator==(const Derived& lhs, const Native& rhs) { - return !(lhs==rhs); + return static_cast(lhs)==rhs; } template requires cuda_native_handle_for - friend bool operator!=(const Native& lhs, const Derived& rhs) + friend bool operator==(const Native& lhs, const Derived& rhs) { - return !(lhs==rhs); + return lhs==static_cast(rhs); } }; diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index f5049a775a..1ebeb79a48 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -101,8 +101,8 @@ if (pcuNewCall) - `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. 
After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- `CCUDAImportedMemory::getMappedBuffer` writes an opaque `cuda_interop::SCUdeviceptr` in SDK-free code. SDK opt-in code can pass `CUdeviceptr` directly. -- `CCUDAHandler::createProgram`, `compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can call them with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. +- SDK-free output parameters stay pointer-based. SDK opt-in code can pass native CUDA output variables directly through small inline bridge overloads. +- `CCUDAHandler::compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can use their results with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. - `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. 
From d8d4c3b8d6e2e79ab49bb56c8b6e69b04de02624 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 06:37:07 +0200 Subject: [PATCH 122/149] Centralize CUDA output bridge --- include/nbl/video/CCUDAHandler.h | 8 +-- include/nbl/video/CCUDAImportedMemory.h | 7 +-- include/nbl/video/CUDAInteropHandles.h | 57 ++++++++++++------- src/nbl/ext/CUDAInterop/README.md | 5 +- .../ext/CUDAInterop/smoke/native_opt_in.cpp | 17 ++++++ src/nbl/video/CCUDAHandler.cpp | 4 +- src/nbl/video/CCUDAImportedMemory.cpp | 4 +- 7 files changed, 64 insertions(+), 38 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9975a7e212..9af65ff25b 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,7 +16,6 @@ #include #include #include -#include namespace nbl::video { @@ -85,12 +84,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted bool defaultHandleResult(cuda_interop::SCUresult result) const; bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; - cuda_interop::SNVRTCResult createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - NBL_CUDA_INTEROP_NATIVE_FOR(Program, cuda_interop::SNVRTCProgram) - inline cuda_interop::SNVRTCResult createProgram(Program* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - return createProgram(cuda_interop::asOpaqueOutput(prog),std::move(source),name,headerCount,headerContents,includeNames); - } + cuda_interop::SNVRTCResult createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, 
core::SRange options) const; cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 720ae30b3d..0266706480 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -16,12 +16,7 @@ class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted public: ~CCUDAImportedMemory() override; cuda_interop::SCUexternalMemory getInternalObject() const; - bool getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const; - NBL_CUDA_INTEROP_NATIVE_FOR(DevicePtr, cuda_interop::SCUdeviceptr) - inline bool getMappedBuffer(DevicePtr& mappedBuffer) const - { - return getMappedBuffer(cuda_interop::asOpaqueOutput(mappedBuffer)); - } + bool getMappedBuffer(cuda_interop::SOutput mappedBuffer) const; private: friend class CCUDADevice; diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index 3b555b599f..c0002f5cc9 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -36,29 +36,46 @@ concept cuda_native_handle_for = std::same_as,typename SOpaqueCUDANativeType::type> && cuda_opaque_handle>; -template -requires cuda_native_handle_for -Opaque* asOpaqueOutput(Native* native) -{ - return reinterpret_cast(native); -} +/* + Output bridge for SDK-free APIs that write CUDA/NVRTC handles. -template -requires cuda_native_handle_for -Opaque* asOpaqueOutput(Native& native) -{ - return asOpaqueOutput(&native); -} + Value conversions in SOpaqueCUDAHandle are enough for inputs and return values, but C++ does not apply those + user-defined conversions through output pointers or mutable output references. This type centralizes that one + boundary case. 
Without it, every Nabla method that writes a native CUDA/NVRTC handle would need a separate + SDK-typed overload, or SDK opt-in callers would have to spell the SDK-free SCU* type manually. With SOutput, + Nabla methods keep one SDK-free signature while SDK opt-in callers still use raw CUDA spelling: -/* - Declare a narrow native-reference bridge for SDK opt-in code. Value conversions make SCU* handles usable as - native CUDA handles after CUDAInteropNative.h is included, but output parameters still need a writable object - whose storage matches the opaque handle. Use asOpaqueOutput inside such bridge overloads. This macro keeps - them short and constrained to the exact SDK type validated for the opaque handle. + CUdeviceptr ptr = 0; + importedMemory->getMappedBuffer(ptr); + nvrtcProgram program = nullptr; + handler->createProgram(program,std::move(source),"kernel.cu"); + + SDK-free callers can pass SCU* objects or SCU* pointers. SDK opt-in callers can pass the matching native + CUDA/NVRTC object or pointer after CUDAInteropNative.h specializes SOpaqueCUDANativeType for the selected SDK. 
*/ -#define NBL_CUDA_INTEROP_NATIVE_FOR(TYPE, OPAQUE) \ - template \ - requires ::nbl::video::cuda_interop::cuda_native_handle_for +template +struct SOutput +{ + SOutput(std::nullptr_t) : ptr(nullptr) {} + SOutput(Opaque& opaque) : ptr(&opaque) {} + SOutput(Opaque* opaque) : ptr(opaque) {} + + template + requires cuda_native_handle_for + SOutput(Native& native) : ptr(reinterpret_cast(&native)) {} + + template + requires cuda_native_handle_for + SOutput(Native* native) : ptr(reinterpret_cast(native)) {} + + Opaque* get() const { return ptr; } + Opaque& operator*() const { return *ptr; } + operator Opaque*() const { return ptr; } + explicit operator bool() const { return ptr!=nullptr; } + + private: + Opaque* ptr; +}; template struct alignas(alignof(Storage)) SOpaqueCUDAHandle diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index 1ebeb79a48..b764dcea93 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -79,6 +79,9 @@ if (importedMemory) CUdeviceptr exported = memory->getDeviceptr(); +nvrtcProgram program = nullptr; +auto createResult = handler->createProgram(program, std::string(cudaSource), "kernel.cu"); + std::string log; auto compile = handler->compileDirectlyToPTX( std::move(cudaSource), @@ -101,7 +104,7 @@ if (pcuNewCall) - `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. - CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. -- SDK-free output parameters stay pointer-based. SDK opt-in code can pass native CUDA output variables directly through small inline bridge overloads. 
+- SDK-free output parameters use `cuda_interop::SOutput<...>`. SDK-free code can pass opaque `SCU*` values or pointers. SDK opt-in code can pass matching native CUDA/NVRTC output variables directly, for example `CUdeviceptr mapped; importedMemory->getMappedBuffer(mapped);` or `nvrtcProgram program; handler->createProgram(program, ...)`. - `CCUDAHandler::compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can use their results with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. - `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. - `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. 
diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index c2f9a97ac4..71f2d3e7b9 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -111,6 +112,19 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) ); return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; } + +bool nativeNVRTCOutputProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + extern "C" __global__ void native_output_probe() {} + )cuda"; + + nvrtcProgram program = nullptr; + const auto result = handler.createProgram(program,std::string(Source),"native_output_probe.cu"); + if (program) + handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); + return result==NVRTC_SUCCESS; +} } class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework @@ -154,6 +168,9 @@ class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramew if (!pcuDriverGetVersion || pcuDriverGetVersion(&loadedDriverVersion)!=CUDA_SUCCESS || loadedDriverVersion==0) return false; + if (!nativeNVRTCOutputProbe(*handler)) + return false; + if (!cudaFp16HeaderCompileProbe(*handler)) return false; diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 094046ea6c..f81e6e6ade 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -1137,7 +1137,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste ); } -cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SNVRTCProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount, const char* const* 
headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -1230,7 +1230,7 @@ CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( getNVRTCFunctionTable().pnvrtcDestroyProgram(&nativeProgram); }); - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); + result = createProgram(program,std::move(source),filename,headerCount,headerContents,includeNames); return compileDirectlyToPTX_impl(*this,result,program,nvrtcOptions,log); } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 3743790a58..ec5438643f 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -23,7 +23,7 @@ cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const return m_native->handle; } -bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr* mappedBuffer) const +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput mappedBuffer) const { if (!mappedBuffer) return false; @@ -73,7 +73,7 @@ cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const return {}; } -bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SCUdeviceptr*) const +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput) const { return false; } From fe3fd663ab41c63b64c07acd786566e30e753ded Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 08:40:13 +0200 Subject: [PATCH 123/149] Document CUDA interop handles --- include/nbl/video/CUDAInteropHandles.h | 37 +++++++++++--------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h index c0002f5cc9..a7664310aa 100644 --- a/include/nbl/video/CUDAInteropHandles.h +++ b/include/nbl/video/CUDAInteropHandles.h @@ -13,12 +13,17 @@ namespace nbl::video::cuda_interop { /* - 
SDK-free CUDA handle surrogates used by Nabla's public video API. + SDK-free CUDA interop boundary. - These types are the small glue layer between Nabla and SDK-typed CUDA interop code. They let nbl/video/CCUDA*.h - expose CUDA-related objects without including cuda.h or nvrtc.h, so consumers that only link Nabla::Nabla do - not inherit CUDA SDK as a public compile-time dependency. CUDAInteropNative.h maps these opaque handles back - to the real CU* types and checks their size/alignment against the SDK selected by the opt-in consumer. + Public nbl/video/CCUDA*.h headers cannot include cuda.h or nvrtc.h, but they still need to carry CUDA interop + state and write CUDA/NVRTC handles for opt-in users. The split below keeps those two roles explicit: + - SOpaqueCUDAHandle owns handle bits and is used in Nabla object layout, parameters, and return values. + - SOutput is a non-owning output adapter. C++ does not apply user-defined conversions through T* or mutable T&, + so output parameters need a small bridge to write directly into either SCU* storage or native SDK storage. + + CUDAInteropNative.h is the only header that maps these opaque types back to CUDA/NVRTC SDK types. These helpers + are class templates with in-class member definitions, so they are inline by the language rules and add no exported + symbols. */ template struct SOpaqueCUDANativeType; @@ -37,21 +42,8 @@ concept cuda_native_handle_for = cuda_opaque_handle>; /* - Output bridge for SDK-free APIs that write CUDA/NVRTC handles. - - Value conversions in SOpaqueCUDAHandle are enough for inputs and return values, but C++ does not apply those - user-defined conversions through output pointers or mutable output references. This type centralizes that one - boundary case. Without it, every Nabla method that writes a native CUDA/NVRTC handle would need a separate - SDK-typed overload, or SDK opt-in callers would have to spell the SDK-free SCU* type manually. 
With SOutput, - Nabla methods keep one SDK-free signature while SDK opt-in callers still use raw CUDA spelling: - - CUdeviceptr ptr = 0; - importedMemory->getMappedBuffer(ptr); - nvrtcProgram program = nullptr; - handler->createProgram(program,std::move(source),"kernel.cu"); - - SDK-free callers can pass SCU* objects or SCU* pointers. SDK opt-in callers can pass the matching native - CUDA/NVRTC object or pointer after CUDAInteropNative.h specializes SOpaqueCUDANativeType for the selected SDK. + Non-owning output bridge for SDK-free APIs. It keeps one Nabla signature while opt-in callers can pass raw + CUDA/NVRTC output variables directly, e.g. `CUdeviceptr ptr; memory->getMappedBuffer(ptr);`. */ template struct SOutput @@ -68,7 +60,6 @@ struct SOutput requires cuda_native_handle_for SOutput(Native* native) : ptr(reinterpret_cast(native)) {} - Opaque* get() const { return ptr; } Opaque& operator*() const { return *ptr; } operator Opaque*() const { return ptr; } explicit operator bool() const { return ptr!=nullptr; } @@ -77,6 +68,10 @@ struct SOutput Opaque* ptr; }; +/* + Owned opaque value used in public Nabla ABI. Native reference conversions become available only after the opt-in + header specializes SOpaqueCUDANativeType for the selected CUDA SDK. 
+*/ template struct alignas(alignof(Storage)) SOpaqueCUDAHandle { From d5dfadefad18f18e657d96fc06b771568a161bbf Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 09:27:44 +0200 Subject: [PATCH 124/149] Make CUDA PTX compile log optional --- examples_tests | 2 +- include/nbl/video/CCUDAHandler.h | 2 +- src/nbl/ext/CUDAInterop/README.md | 2 +- src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 10 ++++++---- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/examples_tests b/examples_tests index 39441760d3..b4a8725d54 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 39441760d335467158a340ad366302235ba6c30e +Subproject commit b4a8725d54ca960e0d2c353ef08d5f40aa4c4e04 diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9af65ff25b..4d2324cfa6 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -90,7 +90,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; SPTXResult compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr + std::string* log=nullptr, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); inline core::SRange getSTDHeaders() diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md index b764dcea93..0d8ebe2f08 100644 --- a/src/nbl/ext/CUDAInterop/README.md +++ b/src/nbl/ext/CUDAInterop/README.md @@ -87,7 +87,7 @@ auto compile = handler->compileDirectlyToPTX( std::move(cudaSource), "kernel.cu", cudaDevice->geDefaultCompileOptions(), - log + &log ); ``` diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp index 
71f2d3e7b9..d1c15822cd 100644 --- a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -105,7 +105,7 @@ bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) std::string(Source), "cuda_fp16_discovery_probe.cu", {nullptr,nullptr}, - log, + &log, 0, nullptr, nullptr diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index f81e6e6ade..c07af698b1 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -1192,9 +1192,10 @@ CCUDAHandler::SPTXResult CCUDAHandler::getPTX(cuda_interop::SNVRTCProgram prog) return {std::move(ptx),nvrtc.pnvrtcGetPTX(nativeProgram,ptxPtr)}; } -static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string& log) +static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string* log) { - log.clear(); + if (log) + log->clear(); const nvrtcResult nativeResult = result; if (nativeResult!=NVRTC_SUCCESS) return {nullptr,result}; @@ -1210,7 +1211,8 @@ static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, const auto* optionsBegin = options.empty() ? nullptr:options.data(); const auto* optionsEnd = options.empty() ? 
nullptr:optionsBegin+options.size(); result = handler.compileProgram(program,{optionsBegin,optionsEnd}); - handler.getProgramLog(program,log); + if (log) + handler.getProgramLog(program,*log); if (static_cast(result)!=NVRTC_SUCCESS) return {nullptr,result}; @@ -1219,7 +1221,7 @@ static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( std::string&& source, const char* filename, core::SRange nvrtcOptions, - std::string& log, const int headerCount, const char* const* headerContents, const char* const* includeNames) + std::string* log, const int headerCount, const char* const* headerContents, const char* const* includeNames) { cuda_interop::SNVRTCProgram program = {}; cuda_interop::SNVRTCResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; From 2d53e9af42dea6f618c5525199f3242ea5058fad Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 09:56:09 +0200 Subject: [PATCH 125/149] Enable CUDA in Windows CI --- .github/workflows/build-nabla.yml | 154 +++++++++++++++++++++++++++++- CMakePresets.json | 2 +- 2 files changed, 154 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 8a62da4fc7..dd782ec389 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -47,9 +47,95 @@ jobs: } & $rgExe --version + prepare-host-cuda: + name: Prepare host CUDA 13.2 + runs-on: windows-2022 + + env: + cudaVersion: '13.2.1' + cudaMajorMinor: '13.2' + cudaInstallRoot: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2 + cudaCacheRoot: ${{ runner.tool_cache }}\cuda\v13.2 + + steps: + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ env.cudaCacheRoot }} + key: cuda-toolkit-${{ env.cudaVersion }}-windows-x64-v1 + + - name: Install CUDA Toolkit + if: steps.cache-cuda.outputs.cache-hit != 'true' + shell: pwsh + run: | + function Test-CudaToolkit { + 
param([string]$Root) + + $nvcc = Join-Path $Root 'bin\nvcc.exe' + $cudaH = Join-Path $Root 'include\cuda.h' + $nvrtcH = Join-Path $Root 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + return $false + } + + $version = & $nvcc --version 2>&1 + return ($LASTEXITCODE -eq 0 -and ($version -match "release $env:cudaMajorMinor")) + } + + if (Test-CudaToolkit $env:cudaCacheRoot) { + Write-Host "CUDA Toolkit $env:cudaMajorMinor already restored at $env:cudaCacheRoot" + return + } + + if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { + if (-not (Get-Command winget -ErrorAction SilentlyContinue)) { + throw "winget is required to install CUDA Toolkit $env:cudaVersion on the host runner." + } + + winget install ` + --exact ` + --id Nvidia.CUDA ` + --version $env:cudaVersion ` + --source winget ` + --accept-package-agreements ` + --accept-source-agreements ` + --disable-interactivity + + if ($LASTEXITCODE -ne 0) { + throw "CUDA Toolkit $env:cudaVersion installation failed." + } + } + + if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { + throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." + } + + New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null + robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP + if ($LASTEXITCODE -gt 7) { + throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" + } + $global:LASTEXITCODE = 0 + + - name: Verify CUDA Toolkit + shell: pwsh + run: | + $nvcc = Join-Path $env:cudaCacheRoot 'bin\nvcc.exe' + $cudaH = Join-Path $env:cudaCacheRoot 'include\cuda.h' + $nvrtcH = Join-Path $env:cudaCacheRoot 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + throw "CUDA Toolkit cache is incomplete at $env:cudaCacheRoot." 
+ } + $version = & $nvcc --version + if ($LASTEXITCODE -ne 0 -or -not ($version -match "release $env:cudaMajorMinor")) { + throw "Expected CUDA Toolkit $env:cudaMajorMinor. nvcc output: $version" + } + $version + build-windows: name: Nabla (${{ matrix.os }}, ${{ matrix.vendor }}-${{ matrix.tag }}, ${{ matrix.config }}) - needs: prepare-host-rg + needs: [prepare-host-rg, prepare-host-cuda] runs-on: ${{ matrix.os }} env: @@ -59,6 +145,9 @@ jobs: mount: C:\mount\nabla binary: C:\mount\nabla\build-ct install: build-ct\install + cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 + cudaContainerRoot: C:\cuda\v13.2 + cudaContainerRootCMake: C:/cuda/v13.2 strategy: fail-fast: false @@ -183,6 +272,28 @@ jobs: $rgDir | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append & $rgExe --version + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ env.cudaHostRoot }} + key: cuda-toolkit-13.2.1-windows-x64-v1 + + - name: Verify CUDA Toolkit + shell: pwsh + run: | + $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' + $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' + $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." + } + $version = & $nvcc --version + if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { + throw "Expected CUDA Toolkit 13.2. 
nvcc output: $version" + } + $version + - name: Pull Image run: | docker pull "${{ env.image }}:${{ matrix.tag }}" @@ -199,8 +310,11 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` + --env "CUDA_PATH=${{ env.cudaContainerRoot }}" ` + --env "CUDA_PATH_V13_2=${{ env.cudaContainerRoot }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${{ env.cudaHostRoot }}:${{ env.cudaContainerRoot }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -222,6 +336,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.cudaContainerRootCMake }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -623,6 +738,8 @@ jobs: name: Nabla / Smoke (${{ matrix.os }}, ${{ matrix.vendor }}-latest, ${{ matrix.config }}) needs: build-windows runs-on: windows-2022 + env: + cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 strategy: fail-fast: false matrix: @@ -636,6 +753,7 @@ jobs: fetch-depth: 1 sparse-checkout: | smoke + src/nbl/ext/CUDAInterop/smoke - name: Download VulkanSDK uses: Devsh-Graphics-Programming/install-vulkan-sdk-action@v1.4.0-devsh.1 @@ -646,6 +764,28 @@ jobs: install_lavapipe: true github_token: ${{ github.token }} + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ env.cudaHostRoot }} + key: cuda-toolkit-13.2.1-windows-x64-v1 + + - name: Verify CUDA Toolkit + shell: pwsh + run: | + $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' + $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' + $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' + if (-not (Test-Path $nvcc) -or 
-not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." + } + $version = & $nvcc --version + if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { + throw "Expected CUDA Toolkit 13.2. nvcc output: $version" + } + $version + - name: Download Nabla install artifact uses: actions/download-artifact@v8 with: @@ -668,3 +808,15 @@ jobs: - name: Smoke Flow BUILD_ONLY run: cmake -D FLOW=BUILD_ONLY -D CONFIG=${{ matrix.config }} -P smoke/RunSmokeFlow.cmake + + - name: Build CUDA interop package smoke + shell: pwsh + run: | + cmake ` + -S src/nbl/ext/CUDAInterop/smoke ` + -B smoke/cuda-interop-smoke ` + -D "CMAKE_PREFIX_PATH=${{ github.workspace }}\smoke\build-ct\install\cmake" ` + -D "NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE=ON" ` + -D "Nabla_CUDA_TOOLKIT_ROOT=${{ env.cudaHostRoot }}" + + cmake --build smoke/cuda-interop-smoke --config ${{ matrix.config }} diff --git a/CMakePresets.json b/CMakePresets.json index 3c11567f46..2c25d06953 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -14,7 +14,7 @@ "NBL_EMBED_BUILTIN_RESOURCES": "ON", "NBL_NSC_MODE": "SOURCE", "NBL_UPDATE_GIT_SUBMODULE": "OFF", - "NBL_COMPILE_WITH_CUDA": "OFF", + "NBL_COMPILE_WITH_CUDA": "ON", "NBL_BUILD_OPTIX": "OFF", "NBL_BUILD_MITSUBA_LOADER": "ON", "NBL_BUILD_RADEON_RAYS": "OFF", From 0243ed07664ffb222d7e628e0414891dd75dc2c6 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 10:31:57 +0200 Subject: [PATCH 126/149] Fix CUDA cache path in CI --- .github/workflows/build-nabla.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index dd782ec389..865fe7a0fc 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -55,7 +55,7 @@ jobs: cudaVersion: '13.2.1' cudaMajorMinor: '13.2' cudaInstallRoot: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2 - cudaCacheRoot: 
${{ runner.tool_cache }}\cuda\v13.2 + cudaCacheRoot: C:\nabla-ci\cuda\v13.2 steps: - name: Restore CUDA Toolkit @@ -145,7 +145,7 @@ jobs: mount: C:\mount\nabla binary: C:\mount\nabla\build-ct install: build-ct\install - cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 + cudaHostRoot: C:\nabla-ci\cuda\v13.2 cudaContainerRoot: C:\cuda\v13.2 cudaContainerRootCMake: C:/cuda/v13.2 @@ -739,7 +739,7 @@ jobs: needs: build-windows runs-on: windows-2022 env: - cudaHostRoot: ${{ runner.tool_cache }}\cuda\v13.2 + cudaHostRoot: C:\nabla-ci\cuda\v13.2 strategy: fail-fast: false matrix: From 4ea20f7c7caca2dc75f8ba90d8f636eb31d8285b Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 11:26:32 +0200 Subject: [PATCH 127/149] Seed CUDA cache on Windows 2025 --- .github/workflows/build-nabla.yml | 62 ++++++++++++++++--------------- 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 865fe7a0fc..aae2173fb8 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -49,7 +49,7 @@ jobs: prepare-host-cuda: name: Prepare host CUDA 13.2 - runs-on: windows-2022 + runs-on: windows-2025 env: cudaVersion: '13.2.1' @@ -63,7 +63,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaCacheRoot }} - key: cuda-toolkit-${{ env.cudaVersion }}-windows-x64-v1 + key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-v1 - name: Install CUDA Toolkit if: steps.cache-cuda.outputs.cache-hit != 'true' @@ -75,7 +75,9 @@ jobs: $nvcc = Join-Path $Root 'bin\nvcc.exe' $cudaH = Join-Path $Root 'include\cuda.h' $nvrtcH = Join-Path $Root 'include\nvrtc.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + $fp16H = Join-Path $Root 'include\cuda_fp16.h' + $vectorTypesH = Join-Path $Root 'include\vector_types.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not 
(Test-Path $vectorTypesH)) { return $false } @@ -88,35 +90,33 @@ jobs: return } - if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { - if (-not (Get-Command winget -ErrorAction SilentlyContinue)) { - throw "winget is required to install CUDA Toolkit $env:cudaVersion on the host runner." - } - - winget install ` - --exact ` - --id Nvidia.CUDA ` - --version $env:cudaVersion ` - --source winget ` - --accept-package-agreements ` - --accept-source-agreements ` - --disable-interactivity - - if ($LASTEXITCODE -ne 0) { - throw "CUDA Toolkit $env:cudaVersion installation failed." - } + winget source update + winget install ` + --exact ` + --id Nvidia.CUDA ` + --version $env:cudaVersion ` + --source winget ` + --location $env:cudaCacheRoot ` + --accept-package-agreements ` + --accept-source-agreements ` + --disable-interactivity + + if ($LASTEXITCODE -ne 0) { + throw "CUDA Toolkit $env:cudaVersion installation failed." } - if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { - throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." + if (-not (Test-CudaToolkit $env:cudaCacheRoot) -and (Test-CudaToolkit $env:cudaInstallRoot)) { + New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null + robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP + if ($LASTEXITCODE -gt 7) { + throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" + } + $global:LASTEXITCODE = 0 } - New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null - robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP - if ($LASTEXITCODE -gt 7) { - throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" + if (-not (Test-CudaToolkit $env:cudaCacheRoot)) { + throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaCacheRoot after installation." 
} - $global:LASTEXITCODE = 0 - name: Verify CUDA Toolkit shell: pwsh @@ -124,7 +124,9 @@ jobs: $nvcc = Join-Path $env:cudaCacheRoot 'bin\nvcc.exe' $cudaH = Join-Path $env:cudaCacheRoot 'include\cuda.h' $nvrtcH = Join-Path $env:cudaCacheRoot 'include\nvrtc.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { + $fp16H = Join-Path $env:cudaCacheRoot 'include\cuda_fp16.h' + $vectorTypesH = Join-Path $env:cudaCacheRoot 'include\vector_types.h' + if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not (Test-Path $vectorTypesH)) { throw "CUDA Toolkit cache is incomplete at $env:cudaCacheRoot." } $version = & $nvcc --version @@ -277,7 +279,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-v1 - name: Verify CUDA Toolkit shell: pwsh @@ -769,7 +771,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-v1 - name: Verify CUDA Toolkit shell: pwsh From 85fbf7f9bc36e4fd15751344c1427a6176421243 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 12:11:12 +0200 Subject: [PATCH 128/149] Use Choco for CUDA cache seed --- .github/workflows/build-nabla.yml | 34 ++++++++++++++----------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index aae2173fb8..d8efabd3a4 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -63,7 +63,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaCacheRoot }} - key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-v1 + key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-choco-v1 - name: Install CUDA Toolkit if: steps.cache-cuda.outputs.cache-hit != 'true' @@ -90,29 +90,25 @@ jobs: return } 
- winget source update - winget install ` - --exact ` - --id Nvidia.CUDA ` + choco install cuda ` --version $env:cudaVersion ` - --source winget ` - --location $env:cudaCacheRoot ` - --accept-package-agreements ` - --accept-source-agreements ` - --disable-interactivity + --yes ` + --no-progress if ($LASTEXITCODE -ne 0) { throw "CUDA Toolkit $env:cudaVersion installation failed." } - if (-not (Test-CudaToolkit $env:cudaCacheRoot) -and (Test-CudaToolkit $env:cudaInstallRoot)) { - New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null - robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP - if ($LASTEXITCODE -gt 7) { - throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" - } - $global:LASTEXITCODE = 0 + if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { + throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." + } + + New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null + robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP + if ($LASTEXITCODE -gt 7) { + throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" } + $global:LASTEXITCODE = 0 if (-not (Test-CudaToolkit $env:cudaCacheRoot)) { throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaCacheRoot after installation." 
@@ -279,7 +275,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 - name: Verify CUDA Toolkit shell: pwsh @@ -771,7 +767,7 @@ jobs: uses: actions/cache@v5 with: path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-v1 + key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 - name: Verify CUDA Toolkit shell: pwsh From 6008285457ae6f1a6bd632a4755f8e4ddc33802b Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 13:50:47 +0200 Subject: [PATCH 129/149] Update CUDA interop examples pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index b4a8725d54..10022c5de1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit b4a8725d54ca960e0d2c353ef08d5f40aa4c4e04 +Subproject commit 10022c5de1b8350b8a4c85c35871bcd84e4877a7 From 82d82a26138f43591f2c3ea3af2b7a287169181a Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 15:20:59 +0200 Subject: [PATCH 130/149] Update CUDA interop examples pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 10022c5de1..39d02e2602 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 10022c5de1b8350b8a4c85c35871bcd84e4877a7 +Subproject commit 39d02e26023c72a7d3241e5df85e9b7c4afacb84 From 828211c1cbe23b5662877ae446162a26108eb21c Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Mon, 11 May 2026 15:38:01 +0200 Subject: [PATCH 131/149] Retry CI image pull --- .github/workflows/build-nabla.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index d8efabd3a4..7e89d68e15 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -294,7 +294,16 
@@ jobs: - name: Pull Image run: | - docker pull "${{ env.image }}:${{ matrix.tag }}" + $image = "${{ env.image }}:${{ matrix.tag }}" + for ($attempt = 1; $attempt -le 3; $attempt++) { + docker pull $image + if ($LASTEXITCODE -eq 0) { + exit 0 + } + Write-Warning "docker pull failed for $image on attempt $attempt." + Start-Sleep -Seconds (15 * $attempt) + } + exit $LASTEXITCODE - name: Run Container run: | From e913518df6105ef947101c4320bff066ea679a45 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 12 May 2026 07:16:25 +0200 Subject: [PATCH 132/149] Deduplicate CUDA CI setup --- .github/scripts/ci_cuda_toolkit.py | 155 +++++++++++++++++++++++++++++ .github/workflows/build-nabla.yml | 149 ++++++++------------------- 2 files changed, 194 insertions(+), 110 deletions(-) create mode 100644 .github/scripts/ci_cuda_toolkit.py diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py new file mode 100644 index 0000000000..ee76eaf0b5 --- /dev/null +++ b/.github/scripts/ci_cuda_toolkit.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +import argparse +import os +import platform +import subprocess +from pathlib import Path + + +REQUIRED_HEADERS = ( + "cuda.h", + "nvrtc.h", + "cuda_fp16.h", + "vector_types.h", +) + + +def cuda_version() -> str: + version = os.environ.get("CUDA_VERSION", "").strip() + if not version: + raise SystemExit("CUDA_VERSION is not set.") + parts = version.split(".") + if len(parts) < 2 or not all(part.isdigit() for part in parts[:2]): + raise SystemExit(f"CUDA_VERSION must start with major.minor, got: {version}") + return version + + +def major_minor(version: str) -> str: + major, minor, *_ = version.split(".") + return f"{major}.{minor}" + + +def windows_paths(version: str) -> dict[str, str]: + mm = major_minor(version) + major, minor = mm.split(".") + return { + "cache_root": rf"C:\nabla-ci\cuda\v{mm}", + "container_root": rf"C:\cuda\v{mm}", + "container_root_cmake": f"C:/cuda/v{mm}", + "version_env": 
f"CUDA_PATH_V{major}_{minor}", + "cache_key": f"cuda-toolkit-{version}-windows-2025-x64-choco-v1", + } + + +def windows_install_root(version: str) -> Path: + return Path(rf"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v{major_minor(version)}") + + +def paths() -> dict[str, str]: + version = cuda_version() + if platform.system() == "Windows": + return windows_paths(version) + mm = major_minor(version) + return { + "cache_root": f"/opt/nabla-ci/cuda/v{mm}", + "container_root": f"/cuda/v{mm}", + "container_root_cmake": f"/cuda/v{mm}", + "version_env": f"CUDA_PATH_V{mm.replace('.', '_')}", + "cache_key": f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1", + } + + +def emit_outputs() -> None: + output = os.environ.get("GITHUB_OUTPUT") + values = paths() + lines = [f"{key}={value}" for key, value in values.items()] + if output: + with open(output, "a", encoding="utf-8") as file: + file.write("\n".join(lines)) + file.write("\n") + else: + print("\n".join(lines)) + + +def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: + print("+", " ".join(command)) + return subprocess.run(command, check=False, text=True, **kwargs) + + +def nvcc_path(root: Path) -> Path: + executable = "nvcc.exe" if platform.system() == "Windows" else "nvcc" + return root / "bin" / executable + + +def verify_toolkit(root: Path, version: str) -> bool: + missing = [str(nvcc_path(root))] + missing.extend(str(root / "include" / header) for header in REQUIRED_HEADERS) + missing = [path for path in missing if not Path(path).exists()] + if missing: + print(f"CUDA Toolkit is incomplete at {root}.") + for path in missing: + print(f"missing: {path}") + return False + + result = run([str(nvcc_path(root)), "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + print(result.stdout) + expected = f"release {major_minor(version)}" + if result.returncode != 0 or expected not in result.stdout: + print(f"Expected CUDA Toolkit {major_minor(version)} at {root}.") + return False + 
return True + + +def verify() -> None: + version = cuda_version() + values = paths() + root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", values["cache_root"])) + if not verify_toolkit(root, version): + raise SystemExit(1) + + +def install() -> None: + if platform.system() != "Windows": + raise SystemExit("CUDA Toolkit install is only implemented for Windows CI.") + + version = cuda_version() + values = paths() + install_root = windows_install_root(version) + cache_root = Path(values["cache_root"]) + + if verify_toolkit(cache_root, version): + print(f"CUDA Toolkit {major_minor(version)} already restored at {cache_root}") + return + + result = run(["choco", "install", "cuda", "--version", version, "--yes", "--no-progress"]) + if result.returncode != 0: + raise SystemExit(f"CUDA Toolkit {version} installation failed.") + + if not verify_toolkit(install_root, version): + raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {install_root} after installation.") + + cache_root.mkdir(parents=True, exist_ok=True) + result = run(["robocopy", str(install_root), str(cache_root), "/MIR", "/R:2", "/W:2", "/NFL", "/NDL", "/NP"]) + if result.returncode > 7: + raise SystemExit(f"Failed to mirror CUDA Toolkit into cache root. 
robocopy exit code: {result.returncode}") + + if not verify_toolkit(cache_root, version): + raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {cache_root} after installation.") + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("command", choices=("outputs", "install", "verify")) + args = parser.parse_args() + + if args.command == "outputs": + emit_outputs() + elif args.command == "install": + install() + elif args.command == "verify": + verify() + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 7e89d68e15..e151c291e0 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -9,6 +9,9 @@ permissions: contents: read actions: read +env: + CUDA_VERSION: '13.2.1' + concurrency: group: push-lock-${{ github.ref }} cancel-in-progress: true @@ -48,88 +51,34 @@ jobs: & $rgExe --version prepare-host-cuda: - name: Prepare host CUDA 13.2 + name: Prepare host CUDA runs-on: windows-2025 - env: - cudaVersion: '13.2.1' - cudaMajorMinor: '13.2' - cudaInstallRoot: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.2 - cudaCacheRoot: C:\nabla-ci\cuda\v13.2 - steps: + - name: Checkout CUDA CI helper + uses: actions/checkout@v6 + with: + fetch-depth: 1 + sparse-checkout: | + .github/scripts + + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Restore CUDA Toolkit id: cache-cuda uses: actions/cache@v5 with: - path: ${{ env.cudaCacheRoot }} - key: cuda-toolkit-${{ env.cudaVersion }}-windows-2025-x64-choco-v1 + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} - name: Install CUDA Toolkit if: steps.cache-cuda.outputs.cache-hit != 'true' - shell: pwsh - run: | - function Test-CudaToolkit { - param([string]$Root) - - $nvcc = Join-Path $Root 'bin\nvcc.exe' - $cudaH = Join-Path $Root 'include\cuda.h' - $nvrtcH = Join-Path $Root 
'include\nvrtc.h' - $fp16H = Join-Path $Root 'include\cuda_fp16.h' - $vectorTypesH = Join-Path $Root 'include\vector_types.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not (Test-Path $vectorTypesH)) { - return $false - } - - $version = & $nvcc --version 2>&1 - return ($LASTEXITCODE -eq 0 -and ($version -match "release $env:cudaMajorMinor")) - } - - if (Test-CudaToolkit $env:cudaCacheRoot) { - Write-Host "CUDA Toolkit $env:cudaMajorMinor already restored at $env:cudaCacheRoot" - return - } - - choco install cuda ` - --version $env:cudaVersion ` - --yes ` - --no-progress - - if ($LASTEXITCODE -ne 0) { - throw "CUDA Toolkit $env:cudaVersion installation failed." - } - - if (-not (Test-CudaToolkit $env:cudaInstallRoot)) { - throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaInstallRoot after installation." - } - - New-Item -ItemType Directory -Force -Path $env:cudaCacheRoot | Out-Null - robocopy $env:cudaInstallRoot $env:cudaCacheRoot /MIR /R:2 /W:2 /NFL /NDL /NP - if ($LASTEXITCODE -gt 7) { - throw "Failed to mirror CUDA Toolkit into cache root. robocopy exit code: $LASTEXITCODE" - } - $global:LASTEXITCODE = 0 - - if (-not (Test-CudaToolkit $env:cudaCacheRoot)) { - throw "CUDA Toolkit $env:cudaMajorMinor was not found at $env:cudaCacheRoot after installation." 
- } + run: python .github/scripts/ci_cuda_toolkit.py install - name: Verify CUDA Toolkit - shell: pwsh - run: | - $nvcc = Join-Path $env:cudaCacheRoot 'bin\nvcc.exe' - $cudaH = Join-Path $env:cudaCacheRoot 'include\cuda.h' - $nvrtcH = Join-Path $env:cudaCacheRoot 'include\nvrtc.h' - $fp16H = Join-Path $env:cudaCacheRoot 'include\cuda_fp16.h' - $vectorTypesH = Join-Path $env:cudaCacheRoot 'include\vector_types.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH) -or -not (Test-Path $fp16H) -or -not (Test-Path $vectorTypesH)) { - throw "CUDA Toolkit cache is incomplete at $env:cudaCacheRoot." - } - $version = & $nvcc --version - if ($LASTEXITCODE -ne 0 -or -not ($version -match "release $env:cudaMajorMinor")) { - throw "Expected CUDA Toolkit $env:cudaMajorMinor. nvcc output: $version" - } - $version + run: python .github/scripts/ci_cuda_toolkit.py verify build-windows: name: Nabla (${{ matrix.os }}, ${{ matrix.vendor }}-${{ matrix.tag }}, ${{ matrix.config }}) @@ -143,9 +92,6 @@ jobs: mount: C:\mount\nabla binary: C:\mount\nabla\build-ct install: build-ct\install - cudaHostRoot: C:\nabla-ci\cuda\v13.2 - cudaContainerRoot: C:\cuda\v13.2 - cudaContainerRootCMake: C:/cuda/v13.2 strategy: fail-fast: false @@ -252,6 +198,10 @@ jobs: with: submodules: 'recursive' + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Restore ripgrep host tool id: cache-rg uses: actions/cache@v5 @@ -274,23 +224,11 @@ jobs: id: cache-cuda uses: actions/cache@v5 with: - path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} - name: Verify CUDA Toolkit - shell: pwsh - run: | - $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' - $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' - $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' - if (-not (Test-Path $nvcc) 
-or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { - throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." - } - $version = & $nvcc --version - if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { - throw "Expected CUDA Toolkit 13.2. nvcc output: $version" - } - $version + run: python .github/scripts/ci_cuda_toolkit.py verify - name: Pull Image run: | @@ -317,11 +255,11 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` - --env "CUDA_PATH=${{ env.cudaContainerRoot }}" ` - --env "CUDA_PATH_V13_2=${{ env.cudaContainerRoot }}" ` + --env "CUDA_PATH=${{ steps.cuda.outputs.container_root }}" ` + --env "${{ steps.cuda.outputs.version_env }}=${{ steps.cuda.outputs.container_root }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` - -v "${{ env.cudaHostRoot }}:${{ env.cudaContainerRoot }}" ` + -v "${{ steps.cuda.outputs.cache_root }}:${{ steps.cuda.outputs.container_root }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -343,7 +281,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` - -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.cudaContainerRootCMake }} ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ steps.cuda.outputs.container_root_cmake }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -745,8 +683,6 @@ jobs: name: Nabla / Smoke (${{ matrix.os }}, ${{ matrix.vendor }}-latest, ${{ matrix.config }}) needs: build-windows runs-on: windows-2022 - env: - cudaHostRoot: C:\nabla-ci\cuda\v13.2 strategy: fail-fast: false matrix: @@ -759,9 +695,14 @@ jobs: with: fetch-depth: 1 sparse-checkout: | + .github/scripts smoke src/nbl/ext/CUDAInterop/smoke 
+ - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Download VulkanSDK uses: Devsh-Graphics-Programming/install-vulkan-sdk-action@v1.4.0-devsh.1 with: @@ -775,23 +716,11 @@ jobs: id: cache-cuda uses: actions/cache@v5 with: - path: ${{ env.cudaHostRoot }} - key: cuda-toolkit-13.2.1-windows-2025-x64-choco-v1 + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} - name: Verify CUDA Toolkit - shell: pwsh - run: | - $nvcc = Join-Path '${{ env.cudaHostRoot }}' 'bin\nvcc.exe' - $cudaH = Join-Path '${{ env.cudaHostRoot }}' 'include\cuda.h' - $nvrtcH = Join-Path '${{ env.cudaHostRoot }}' 'include\nvrtc.h' - if (-not (Test-Path $nvcc) -or -not (Test-Path $cudaH) -or -not (Test-Path $nvrtcH)) { - throw "CUDA Toolkit 13.2 cache was not restored to ${{ env.cudaHostRoot }}." - } - $version = & $nvcc --version - if ($LASTEXITCODE -ne 0 -or -not ($version -match 'release 13.2')) { - throw "Expected CUDA Toolkit 13.2. 
nvcc output: $version" - } - $version + run: python .github/scripts/ci_cuda_toolkit.py verify - name: Download Nabla install artifact uses: actions/download-artifact@v8 @@ -824,6 +753,6 @@ jobs: -B smoke/cuda-interop-smoke ` -D "CMAKE_PREFIX_PATH=${{ github.workspace }}\smoke\build-ct\install\cmake" ` -D "NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE=ON" ` - -D "Nabla_CUDA_TOOLKIT_ROOT=${{ env.cudaHostRoot }}" + -D "Nabla_CUDA_TOOLKIT_ROOT=${{ steps.cuda.outputs.cache_root }}" cmake --build smoke/cuda-interop-smoke --config ${{ matrix.config }} From f74efe822c9681eed2a474254a85af6722000660 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 12 May 2026 07:46:21 +0200 Subject: [PATCH 133/149] Simplify CUDA CI cache handling --- .github/scripts/ci_cuda_toolkit.py | 89 +++++++++--------------------- .github/workflows/build-nabla.yml | 15 +++-- 2 files changed, 32 insertions(+), 72 deletions(-) diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py index ee76eaf0b5..04bc1efc91 100644 --- a/.github/scripts/ci_cuda_toolkit.py +++ b/.github/scripts/ci_cuda_toolkit.py @@ -30,40 +30,33 @@ def major_minor(version: str) -> str: return f"{major}.{minor}" -def windows_paths(version: str) -> dict[str, str]: +def cache_root(version: str) -> str: mm = major_minor(version) - major, minor = mm.split(".") - return { - "cache_root": rf"C:\nabla-ci\cuda\v{mm}", - "container_root": rf"C:\cuda\v{mm}", - "container_root_cmake": f"C:/cuda/v{mm}", - "version_env": f"CUDA_PATH_V{major}_{minor}", - "cache_key": f"cuda-toolkit-{version}-windows-2025-x64-choco-v1", - } + if platform.system() == "Windows": + return rf"C:\nabla-ci\cuda\v{mm}" + return f"/opt/nabla-ci/cuda/v{mm}" -def windows_install_root(version: str) -> Path: - return Path(rf"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v{major_minor(version)}") +def cache_key(version: str) -> str: + if platform.system() == "Windows": + return f"cuda-toolkit-{version}-windows-2025-x64-v2" + return 
f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1" -def paths() -> dict[str, str]: - version = cuda_version() +def cache_restore_key(version: str) -> str: if platform.system() == "Windows": - return windows_paths(version) - mm = major_minor(version) - return { - "cache_root": f"/opt/nabla-ci/cuda/v{mm}", - "container_root": f"/cuda/v{mm}", - "container_root_cmake": f"/cuda/v{mm}", - "version_env": f"CUDA_PATH_V{mm.replace('.', '_')}", - "cache_key": f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1", - } + return f"cuda-toolkit-{version}-windows-2025-x64-" + return f"cuda-toolkit-{version}-{platform.system().lower()}-x64-" def emit_outputs() -> None: + version = cuda_version() + lines = ( + f"cache_root={cache_root(version)}", + f"cache_key={cache_key(version)}", + f"cache_restore_key={cache_restore_key(version)}", + ) output = os.environ.get("GITHUB_OUTPUT") - values = paths() - lines = [f"{key}={value}" for key, value in values.items()] if output: with open(output, "a", encoding="utf-8") as file: file.write("\n".join(lines)) @@ -72,22 +65,22 @@ def emit_outputs() -> None: print("\n".join(lines)) -def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: - print("+", " ".join(command)) - return subprocess.run(command, check=False, text=True, **kwargs) - - def nvcc_path(root: Path) -> Path: executable = "nvcc.exe" if platform.system() == "Windows" else "nvcc" return root / "bin" / executable +def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: + print("+", " ".join(command)) + return subprocess.run(command, check=False, text=True, **kwargs) + + def verify_toolkit(root: Path, version: str) -> bool: missing = [str(nvcc_path(root))] missing.extend(str(root / "include" / header) for header in REQUIRED_HEADERS) missing = [path for path in missing if not Path(path).exists()] if missing: - print(f"CUDA Toolkit is incomplete at {root}.") + print(f"CUDA Toolkit cache is incomplete at {root}.") for path in missing: 
print(f"missing: {path}") return False @@ -103,50 +96,18 @@ def verify_toolkit(root: Path, version: str) -> bool: def verify() -> None: version = cuda_version() - values = paths() - root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", values["cache_root"])) + root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", cache_root(version))) if not verify_toolkit(root, version): raise SystemExit(1) -def install() -> None: - if platform.system() != "Windows": - raise SystemExit("CUDA Toolkit install is only implemented for Windows CI.") - - version = cuda_version() - values = paths() - install_root = windows_install_root(version) - cache_root = Path(values["cache_root"]) - - if verify_toolkit(cache_root, version): - print(f"CUDA Toolkit {major_minor(version)} already restored at {cache_root}") - return - - result = run(["choco", "install", "cuda", "--version", version, "--yes", "--no-progress"]) - if result.returncode != 0: - raise SystemExit(f"CUDA Toolkit {version} installation failed.") - - if not verify_toolkit(install_root, version): - raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {install_root} after installation.") - - cache_root.mkdir(parents=True, exist_ok=True) - result = run(["robocopy", str(install_root), str(cache_root), "/MIR", "/R:2", "/W:2", "/NFL", "/NDL", "/NP"]) - if result.returncode > 7: - raise SystemExit(f"Failed to mirror CUDA Toolkit into cache root. 
robocopy exit code: {result.returncode}") - - if not verify_toolkit(cache_root, version): - raise SystemExit(f"CUDA Toolkit {major_minor(version)} was not found at {cache_root} after installation.") - - def main() -> None: parser = argparse.ArgumentParser() - parser.add_argument("command", choices=("outputs", "install", "verify")) + parser.add_argument("command", choices=("outputs", "verify")) args = parser.parse_args() if args.command == "outputs": emit_outputs() - elif args.command == "install": - install() elif args.command == "verify": verify() diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e151c291e0..e61debb623 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -11,6 +11,7 @@ permissions: env: CUDA_VERSION: '13.2.1' + CUDA_CONTAINER_ROOT: C:\cuda concurrency: group: push-lock-${{ github.ref }} @@ -72,10 +73,7 @@ jobs: with: path: ${{ steps.cuda.outputs.cache_root }} key: ${{ steps.cuda.outputs.cache_key }} - - - name: Install CUDA Toolkit - if: steps.cache-cuda.outputs.cache-hit != 'true' - run: python .github/scripts/ci_cuda_toolkit.py install + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} - name: Verify CUDA Toolkit run: python .github/scripts/ci_cuda_toolkit.py verify @@ -226,6 +224,7 @@ jobs: with: path: ${{ steps.cuda.outputs.cache_root }} key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} - name: Verify CUDA Toolkit run: python .github/scripts/ci_cuda_toolkit.py verify @@ -255,11 +254,10 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` - --env "CUDA_PATH=${{ steps.cuda.outputs.container_root }}" ` - --env "${{ steps.cuda.outputs.version_env }}=${{ steps.cuda.outputs.container_root }}" ` + --env "CUDA_PATH=${{ env.CUDA_CONTAINER_ROOT }}" ` --name orphan --network docker_default ` -v "${{ github.workspace 
}}:${{ env.mount }}" ` - -v "${{ steps.cuda.outputs.cache_root }}:${{ steps.cuda.outputs.container_root }}" ` + -v "${{ steps.cuda.outputs.cache_root }}:${{ env.CUDA_CONTAINER_ROOT }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -281,7 +279,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` - -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ steps.cuda.outputs.container_root_cmake }} ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.CUDA_CONTAINER_ROOT }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -718,6 +716,7 @@ jobs: with: path: ${{ steps.cuda.outputs.cache_root }} key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} - name: Verify CUDA Toolkit run: python .github/scripts/ci_cuda_toolkit.py verify From 920f2ef7b8f07a5d9dcf500e500ccd09500b9987 Mon Sep 17 00:00:00 2001 From: Arkadiusz Lachowicz Date: Tue, 12 May 2026 08:16:28 +0200 Subject: [PATCH 134/149] Keep CUDA CI paths configurable --- .github/scripts/ci_cuda_toolkit.py | 12 ++++++++---- .github/workflows/build-nabla.yml | 3 ++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py index 04bc1efc91..975dc7101d 100644 --- a/.github/scripts/ci_cuda_toolkit.py +++ b/.github/scripts/ci_cuda_toolkit.py @@ -30,11 +30,15 @@ def major_minor(version: str) -> str: return f"{major}.{minor}" +def cache_base() -> Path: + base = os.environ.get("CUDA_CACHE_BASE", "").strip() + if not base: + raise SystemExit("CUDA_CACHE_BASE is not set.") + return Path(base) + + def cache_root(version: str) -> str: - mm = major_minor(version) - if platform.system() == "Windows": - return rf"C:\nabla-ci\cuda\v{mm}" - return f"/opt/nabla-ci/cuda/v{mm}" + return 
str(cache_base() / f"v{major_minor(version)}") def cache_key(version: str) -> str: diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index e61debb623..d6d593ebc5 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -11,7 +11,8 @@ permissions: env: CUDA_VERSION: '13.2.1' - CUDA_CONTAINER_ROOT: C:\cuda + CUDA_CACHE_BASE: 'C:\nabla-ci\cuda' + CUDA_CONTAINER_ROOT: 'C:\cuda' concurrency: group: push-lock-${{ github.ref }} From 2744182a16e4e021b84db22577e9784689a7d2d4 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 17:06:58 +0700 Subject: [PATCH 135/149] Temporarily add include path --- src/nbl/video/CCUDADevice.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 27f8f6f906..52659f6903 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -32,6 +32,7 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); + m_defaultCompileOptions.push_back("-IC:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include/"); const auto& cu = m_handler->getCUDAFunctionTable(); From f18a7c6b08d093f0bf62c4c57d07cb6ff09bd118 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 21:43:00 +0700 Subject: [PATCH 136/149] Add final keyword whenever appropriate --- include/nbl/video/CCUDAExportableMemory.h | 2 +- include/nbl/video/CCUDAImportedMemory.h | 2 +- include/nbl/video/CCUDAImportedSemaphore.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index f1ae7f6031..cc815535c1 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -14,7 +14,7 @@ namespace nbl::video { class CCUDADevice; -class NBL_API2 
CCUDAExportableMemory : public core::IReferenceCounted +class NBL_API2 CCUDAExportableMemory final : public core::IReferenceCounted { public: struct SCachedCreationParams diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 0266706480..7dde4908af 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -11,7 +11,7 @@ namespace nbl::video class CCUDADevice; -class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +class NBL_API2 CCUDAImportedMemory final : public core::IReferenceCounted { public: ~CCUDAImportedMemory() override; diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 7f2b266383..204e1b79f3 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -15,7 +15,7 @@ namespace nbl::video class CCUDADevice; -class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted +class NBL_API2 CCUDAImportedSemaphore final : public core::IReferenceCounted { public: ~CCUDAImportedSemaphore() override; From 5ed47a8ca35e175482a01b3fa15018a35f358d24 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 22:08:38 +0700 Subject: [PATCH 137/149] Add documentation for CCUDAExportableMemory::exportAsMemory --- include/nbl/video/CCUDAExportableMemory.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index cc815535c1..6736b55d41 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -27,6 +27,20 @@ class NBL_API2 CCUDAExportableMemory final : public core::IReferenceCounted ~CCUDAExportableMemory() override; cuda_interop::SCUdeviceptr getDeviceptr() const; + + /** + * @brief Exports the CUDA memory as a Vulkan device memory allocation. 
+ * + * Creates an IDeviceMemoryAllocation object that references the underlying CUDA memory, + * allowing it to be used within the Vulkan rendering pipeline while maintaining + * interoperability with CUDA operations. + * + * @param device The logical device that will own the exported memory allocation. + * @param dedication Optional pointer to a device memory backed resource for dedicated allocation. + * If provided, the memory will be dedicated to that specific resource and + * automatically bound to it. + * @return A smart pointer to the exported IDeviceMemoryAllocation, or nullptr on failure. + */ core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; private: From 6af5ab8d177e01e1e5f84dcf8c10d2467fe377dd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 22:29:56 +0700 Subject: [PATCH 138/149] Remove external_handle from memory type iterator --- include/nbl/video/IDeviceMemoryAllocator.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 019fbd9358..7ab72552bf 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -45,12 +45,10 @@ class NBL_API2 IDeviceMemoryAllocator public: IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags, - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, - external_handle_t handle) : + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs), - m_handleType(handleType), - m_handle(handle) + m_handleType(handleType) {} static inline uint32_t end() {return 32u;} @@ -61,7 +59,7 @@ class NBL_API2 IDeviceMemoryAllocator return *this; } - inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) + inline SAllocateInfo operator()(IDeviceMemoryBacked* 
dedication, external_handle_t external_handle) { SAllocateInfo ret; ret.allocationSize = m_reqs.size; @@ -69,7 +67,7 @@ class NBL_API2 IDeviceMemoryAllocator ret.memoryTypeIndex = dereference(); ret.dedication = dedication; ret.externalHandleType = m_handleType; - ret.externalHandle = m_handle; + ret.externalHandle = external_handle; return ret; } @@ -83,7 +81,6 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; - external_handle_t m_handle; }; //! DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB @@ -93,10 +90,9 @@ class NBL_API2 IDeviceMemoryAllocator DefaultMemoryTypeIterator( const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags, - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, - external_handle_t handle + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType ) : - IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) + IMemoryTypeIterator(reqs, allocateFlags, handleType) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } @@ -127,9 +123,9 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, external_handle_t externalHandle = {}) { - for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { - SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); + SAllocateInfo allocateInfo = memTypeIt.operator()(dedication, externalHandle); auto allocation = allocate(allocateInfo); if (allocation.isValid()) return allocation; From 874177db249030b91fc0c23cb0f07ff6428f6ea1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 
May 2026 22:46:24 +0700 Subject: [PATCH 139/149] Rename externalHandle to either exportHandle or importHandle --- include/nbl/video/IDeviceMemoryAllocation.h | 9 +++++---- include/nbl/video/IDeviceMemoryAllocator.h | 2 +- include/nbl/video/ISemaphore.h | 2 +- src/nbl/video/CCUDADevice.cpp | 6 +++--- src/nbl/video/CVulkanLogicalDevice.cpp | 8 ++++---- src/nbl/video/CVulkanMemoryAllocation.h | 10 ++++++++-- src/nbl/video/CVulkanSemaphore.h | 6 +++++- 7 files changed, 27 insertions(+), 16 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 6120574baa..05068631db 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -175,9 +175,10 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; // Handle Type for external resources IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; - //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE - //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - external_handle_t externalHandle = 0; + //! Imports the given handle if importHandle != nullptr && externalHandleType != EHT_NONE + //! 
Creates exportable memory if importHandle == nullptr && externalHandleType != EHT_NONE + // Note:: Closing importHandle is not the responsibility of this class + external_handle_t importHandle = 0; }; struct SCreationParams: SInfo @@ -188,7 +189,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted inline const SCreationParams& getCreationParams() const { return m_params; } - virtual external_handle_t getExternalHandle() const = 0; + virtual external_handle_t getExportHandle() const = 0; protected: diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 7ab72552bf..e108b5ee28 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -67,7 +67,7 @@ class NBL_API2 IDeviceMemoryAllocator ret.memoryTypeIndex = dereference(); ret.dedication = dedication; ret.externalHandleType = m_handleType; - ret.externalHandle = external_handle; + ret.importHandle = external_handle; return ret; } diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 0edc906b5d..f288dad182 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -167,7 +167,7 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; - virtual external_handle_t getExternalHandle() const = 0; + virtual external_handle_t getExportHandle() const = 0; const SCachedCreationParams& getCreationParams() const { return m_creationParams; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index b50c929cb6..0d95be6d9c 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -239,7 +239,7 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co if (!handleType) return nullptr; - const auto externalHandle = mem->getExternalHandle(); + const auto externalHandle = mem->getExportHandle(); CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc 
= {}; #ifdef _WIN32 @@ -274,10 +274,10 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, - .handle = {.win32 = {.handle = sema->getExternalHandle() }}, + .handle = {.win32 = {.handle = sema->getExportHandle() }}, #else .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - .handle = {.fd = sema->getExternalHandle()} + .handle = {.fd = sema->getExportHandle()} #endif }; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 5cc0dbd8f3..20dd61ee10 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -204,7 +204,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkImportMemoryFdInfoKHR importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, .handleType = static_cast(info.externalHandleType), - .fd = info.externalHandle, + .fd = info.importHandle, }; #endif @@ -221,9 +221,9 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca external_handle_t externalHandle = ExternalHandleNull; if (info.externalHandleType) { - if (info.externalHandle) //importing + if (info.importHandle) //importing { - externalHandle = DuplicateExternalHandle(info.externalHandle); + externalHandle = DuplicateExternalHandle(info.importHandle); #ifdef _WIN32 importInfo.handle = externalHandle; #else @@ -266,7 +266,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca if (vk_res!=VK_SUCCESS) return {}; - const bool exported = info.externalHandleType && !info.externalHandle; + const bool exported = info.externalHandleType && !info.importHandle; if (exported) { diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 473d826595..23d26aaf89 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ 
b/src/nbl/video/CVulkanMemoryAllocation.h @@ -23,9 +23,12 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } - inline external_handle_t getExternalHandle() const override + inline external_handle_t getExportHandle() const override { - return m_externalHandle; + // Do not return duplicated importHandle + if (m_params.importHandle == nullptr) + return m_externalHandle; + return nullptr; } private: @@ -36,6 +39,9 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation core::smart_refctd_ptr m_vulkanDevice; const VkDeviceMemory m_deviceMemoryHandle; + + // Can store either duplicated importHandle or exportHandle. + // This handle will be closed when destructor is called, unlike importHandle in SCreationParams. const external_handle_t m_externalHandle; }; diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 12ba147a24..136fa59f51 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -24,12 +24,16 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} - external_handle_t getExternalHandle() const override { return m_externalHandle; } + external_handle_t getExportHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; + + // Can store either duplicated importHandle or exportHandle. + // This handle will be closed when destructor is called, unlike importHandle in SCreationParams. 
+ // For now, it only store exportHandle, since we haven't support importing external semaphore yet const external_handle_t m_externalHandle; }; From b3b3c778a8e0f607b8aacf467a4b014f7f8eba6f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 23:14:20 +0700 Subject: [PATCH 140/149] Remove redundant comment --- src/nbl/video/CVulkanSemaphore.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 136fa59f51..d6a8805a6a 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -32,7 +32,6 @@ class CVulkanSemaphore final : public ISemaphore const VkSemaphore m_semaphore; // Can store either duplicated importHandle or exportHandle. - // This handle will be closed when destructor is called, unlike importHandle in SCreationParams. // For now, it only store exportHandle, since we haven't support importing external semaphore yet const external_handle_t m_externalHandle; }; From 85fcc1a1821818a18987b2d359e180ae7ce3a59d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 23:16:07 +0700 Subject: [PATCH 141/149] Put allocate arguments into SAllocateParams --- include/nbl/video/IDeviceMemoryAllocator.h | 19 ++++++++++++------- include/nbl/video/utilities/IUtilities.h | 4 ++-- src/nbl/video/CCUDAExportableMemory.cpp | 10 ++++++---- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index e108b5ee28..0fac56cf19 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -17,9 +17,16 @@ class NBL_API2 IDeviceMemoryAllocator struct SAllocateInfo : IDeviceMemoryAllocation::SInfo { - size_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use 
VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. + uint8_t memoryTypeIndex = 0u; + }; + + struct SAllocateParams { + IDeviceMemoryBacked* dedication = nullptr; + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + external_handle_t externalHandle = ExternalHandleNull; }; struct SAllocation @@ -116,16 +123,14 @@ class NBL_API2 IDeviceMemoryAllocator }; template + // TODO(kevinyu) : Fix all example_tests if this api change to use SAllocateParams is approved inline SAllocation allocate( const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, - IDeviceMemoryBacked* dedication = nullptr, - const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, - external_handle_t externalHandle = {}) + const SAllocateParams& params) { - for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, params.allocateFlags, params.externalHandleType); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { - SAllocateInfo allocateInfo = memTypeIt.operator()(dedication, externalHandle); + SAllocateInfo allocateInfo = memTypeIt.operator()(params.dedication, params.externalHandle); auto allocation = allocate(allocateInfo); if (allocation.isValid()) return allocation; diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index f52d5d36ef..7ab885572a 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -108,7 +108,7 @@ class NBL_API2 
IUtilities : public core::IReferenceCounted auto reqs = buffer->getMemoryReqs(); reqs.memoryTypeBits &= physicalDevice->getDownStreamingMemoryTypeBits(); - auto deviceMemAllocation = device->allocate(reqs, buffer.get(), allocateFlags); + auto deviceMemAllocation = device->allocate(reqs, { buffer.get(), allocateFlags }); if (!deviceMemAllocation.isValid()) { @@ -143,7 +143,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto reqs = buffer->getMemoryReqs(); reqs.memoryTypeBits &= physicalDevice->getUpStreamingMemoryTypeBits(); - auto deviceMemAllocation = device->allocate(reqs, buffer.get(), allocateFlags); + auto deviceMemAllocation = device->allocate(reqs, { buffer.get(), allocateFlags }); if (!deviceMemAllocation.isValid()) { diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 9333a39f54..9eb0313ffd 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -44,10 +44,12 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM req.requiresDedicatedAllocation = nullptr != dedication; return device->allocate(req, - dedication, - IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, - CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - m_params.externalHandle).memory; + { + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + m_params.externalHandle + }).memory; } CCUDAExportableMemory::~CCUDAExportableMemory() diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d397cc4567..16b2851ad6 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2320,7 +2320,7 @@ class MetaDeviceMemoryAllocator final if (memReqs.requiresDedicatedAllocation) { // allocate and bind right away - auto allocation = m_allocator->allocate(memReqs,gpuObj); + auto allocation = m_allocator->allocate(memReqs, { 
gpuObj }); if (!allocation.isValid()) { m_logger.log("Failed to allocate and bind dedicated memory for %s",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName()); From 1911eb08aa9d8191c0f91bbfa2a0076419e34a67 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 12 May 2026 23:52:27 +0700 Subject: [PATCH 142/149] Move external_handle_t to its own file and to system namespace --- include/nbl/system/ExternalHandle.h | 56 +++++++++++++++++++++ include/nbl/system/declarations.h | 3 ++ include/nbl/video/CCUDAExportableMemory.h | 2 +- include/nbl/video/EApiType.h | 47 ----------------- include/nbl/video/IDeviceMemoryAllocation.h | 8 ++- include/nbl/video/IDeviceMemoryAllocator.h | 4 +- include/nbl/video/ISemaphore.h | 2 +- src/nbl/video/CCUDADevice.cpp | 4 +- src/nbl/video/CCUDAExportableMemory.cpp | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 6 +-- src/nbl/video/CVulkanMemoryAllocation.cpp | 6 +-- src/nbl/video/CVulkanMemoryAllocation.h | 6 +-- src/nbl/video/CVulkanSemaphore.cpp | 2 +- src/nbl/video/CVulkanSemaphore.h | 6 +-- 14 files changed, 85 insertions(+), 69 deletions(-) create mode 100644 include/nbl/system/ExternalHandle.h diff --git a/include/nbl/system/ExternalHandle.h b/include/nbl/system/ExternalHandle.h new file mode 100644 index 0000000000..baac27a0b4 --- /dev/null +++ b/include/nbl/system/ExternalHandle.h @@ -0,0 +1,56 @@ +#ifndef __NBL_EXTERNAL_HANDLE_INCLUDED__ +#define __NBL_EXTERNAL_HANDLE_INCLUDED__ + +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include +#else + #include +#endif + +namespace nbl::system +{ + +using external_handle_t = +#ifdef _WIN32 + void* +#else + int +#endif + ; + +#ifdef _WIN32 +constexpr external_handle_t ExternalHandleNull = nullptr; +#else +constexpr external_handle_t ExternalHandleNull = -1; +#endif + +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle) == 0; +#endif +} + +inline external_handle_t 
DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process, handle, process, &duplicated, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} + +} + +#endif diff --git a/include/nbl/system/declarations.h b/include/nbl/system/declarations.h index fa3dc2c6da..de632fb110 100644 --- a/include/nbl/system/declarations.h +++ b/include/nbl/system/declarations.h @@ -41,4 +41,7 @@ // frameworks (ugh, doesn't work!) //#include "nbl/system/IApplicationFramework.h" +// Handle for import and export gpu resource +#include "nbl/system/ExternalHandle.h" + #endif \ No newline at end of file diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 6736b55d41..126be44f37 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -20,7 +20,7 @@ class NBL_API2 CCUDAExportableMemory final : public core::IReferenceCounted struct SCachedCreationParams { size_t granularSize; - external_handle_t externalHandle; + system::external_handle_t externalHandle; bool deviceLocal; }; diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 89be885b0f..cee9d3c081 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -2,14 +2,6 @@ #define __NBL_E_API_TYPE_H_INCLUDED__ #include -#ifdef _WIN32 - #ifndef WIN32_LEAN_AND_MEAN - #define WIN32_LEAN_AND_MEAN - #endif - #include -#else - #include -#endif namespace nbl::video { @@ -20,45 +12,6 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; -// TODO(kevinyu): Should I move this type and functions to its own file? 
-using external_handle_t = -#ifdef _WIN32 -void* -#else -int -#endif -; - -#ifdef _WIN32 -constexpr external_handle_t ExternalHandleNull = nullptr; -#else -constexpr external_handle_t ExternalHandleNull = -1; -#endif - -inline bool CloseExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - return CloseHandle(handle); -#else - return close(handle)==0; -#endif -} - -inline external_handle_t DuplicateExternalHandle(external_handle_t handle) -{ -#ifdef _WIN32 - HANDLE duplicated = ExternalHandleNull; - - const HANDLE process = GetCurrentProcess(); - if (!DuplicateHandle(process,handle,process,&duplicated,GENERIC_ALL,0,DUPLICATE_SAME_ACCESS)) - return ExternalHandleNull; - - return duplicated; -#else - return dup(handle); -#endif -} - } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 05068631db..0da4843d51 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -91,15 +91,19 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. + [[deprecated]] inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation + [[deprecated]] inline size_t getAllocationSize() const {return m_params.allocationSize;} //! + [[deprecated]] inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! + [[deprecated]] inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) @@ -178,7 +182,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Imports the given handle if importHandle != nullptr && externalHandleType != EHT_NONE //! 
Creates exportable memory if importHandle == nullptr && externalHandleType != EHT_NONE // Note:: Closing importHandle is not the responsibility of this class - external_handle_t importHandle = 0; + system::external_handle_t importHandle = 0; }; struct SCreationParams: SInfo @@ -189,7 +193,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted inline const SCreationParams& getCreationParams() const { return m_params; } - virtual external_handle_t getExportHandle() const = 0; + virtual system::external_handle_t getExportHandle() const = 0; protected: diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 0fac56cf19..94e112a76a 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -26,7 +26,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked* dedication = nullptr; const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; - external_handle_t externalHandle = ExternalHandleNull; + system::external_handle_t externalHandle = system::ExternalHandleNull; }; struct SAllocation @@ -66,7 +66,7 @@ class NBL_API2 IDeviceMemoryAllocator return *this; } - inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication, external_handle_t external_handle) + inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication, system::external_handle_t external_handle) { SAllocateInfo ret; ret.allocationSize = m_reqs.size; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index f288dad182..0d31ba0c60 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -167,7 +167,7 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; - virtual external_handle_t getExportHandle() const = 0; + 
virtual system::external_handle_t getExportHandle() const = 0; const SCachedCreationParams& getCreationParams() const { return m_creationParams; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 0d95be6d9c..edfd36844c 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -213,7 +213,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor handler->defaultHandleResult(cu.pcuMemRelease(mem)); - if (!CloseExternalHandle(params.externalHandle)) + if (!system::CloseExternalHandle(params.externalHandle)) m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; @@ -224,7 +224,7 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor handler->defaultHandleResult(err); handler->defaultHandleResult(cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); handler->defaultHandleResult(cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); - if (!CloseExternalHandle(params.externalHandle)) + if (!system::CloseExternalHandle(params.externalHandle)) m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); return nullptr; } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index 9eb0313ffd..58152c2fcd 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -60,7 +60,7 @@ CCUDAExportableMemory::~CCUDAExportableMemory() m_device->getHandler()->defaultHandleResult(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); - if (!CloseExternalHandle(m_params.externalHandle)) + if (!system::CloseExternalHandle(m_params.externalHandle)) m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 20dd61ee10..b577d6fc81 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ 
b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -80,7 +80,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; - external_handle_t externalHandle = external_handle_t{}; + system::external_handle_t externalHandle = system::ExternalHandleNull; const auto handleType = static_cast(creationParams.externalHandleTypes.value); if (handleType != 0) { @@ -218,12 +218,12 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca const void** pNext = &vk_allocateFlagsInfo.pNext; - external_handle_t externalHandle = ExternalHandleNull; + system::external_handle_t externalHandle = system::ExternalHandleNull; if (info.externalHandleType) { if (info.importHandle) //importing { - externalHandle = DuplicateExternalHandle(info.importHandle); + externalHandle = system::DuplicateExternalHandle(info.importHandle); #ifdef _WIN32 importInfo.handle = externalHandle; #else diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 0ec6fc351d..98227c161b 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -6,15 +6,15 @@ namespace nbl::video CVulkanMemoryAllocation::CVulkanMemoryAllocation( const CVulkanLogicalDevice* dev, const VkDeviceMemory deviceMemoryHandle, - const external_handle_t externalHandle, + const system::external_handle_t externalHandle, SCreationParams&& params ) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle), m_externalHandle(externalHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { - if (m_externalHandle != ExternalHandleNull) + if (m_externalHandle != system::ExternalHandleNull) { - bool re = CloseExternalHandle(m_externalHandle); + bool re = system::CloseExternalHandle(m_externalHandle); assert(re); } 
m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 23d26aaf89..5833384f9b 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -17,13 +17,13 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation CVulkanMemoryAllocation( const CVulkanLogicalDevice* dev, const VkDeviceMemory deviceMemoryHandle, - const external_handle_t externalHandle, + const system::external_handle_t externalHandle, SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } - inline external_handle_t getExportHandle() const override + inline system::external_handle_t getExportHandle() const override { // Do not return duplicated importHandle if (m_params.importHandle == nullptr) @@ -42,7 +42,7 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation // Can store either duplicated importHandle or exportHandle. // This handle will be closed when destructor is called, unlike importHandle in SCreationParams. 
- const external_handle_t m_externalHandle; + const system::external_handle_t m_externalHandle; }; } diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 35aefa6ebd..68e6f42ccb 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -12,7 +12,7 @@ CVulkanSemaphore::~CVulkanSemaphore() vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); if (m_creationParams.externalHandleTypes != EHT_NONE) { - CloseExternalHandle(m_externalHandle); + system::CloseExternalHandle(m_externalHandle); } } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index d6a8805a6a..6a5b66b9ac 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,7 +15,7 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const external_handle_t externalHandle) + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const system::external_handle_t externalHandle) : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); @@ -24,7 +24,7 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} - external_handle_t getExportHandle() const override { return m_externalHandle; } + system::external_handle_t getExportHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; @@ -33,7 +33,7 @@ class CVulkanSemaphore final : public ISemaphore // Can store either duplicated importHandle or exportHandle. 
// For now, it only store exportHandle, since we haven't support importing external semaphore yet - const external_handle_t m_externalHandle; + const system::external_handle_t m_externalHandle; }; } From 30e8e3fb27df53e597e403e79c3b060364eea4d2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 11:39:00 +0700 Subject: [PATCH 143/149] Make some enum flag more compact --- include/nbl/video/IDeviceMemoryAllocation.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 0da4843d51..2a9ee332fa 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -63,7 +63,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //EMPF_RDMA_CAPABLE_BIT_NV = 0x00000200, }; // - enum E_MEMORY_HEAP_FLAGS : uint32_t + enum E_MEMORY_HEAP_FLAGS : uint8_t { EMHF_NONE = 0, EMHF_DEVICE_LOCAL_BIT = 0x00000001, @@ -71,7 +71,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted }; //! Flags for imported/exported allocation - enum E_EXTERNAL_HANDLE_TYPE : uint32_t + enum E_EXTERNAL_HANDLE_TYPE : uint16_t { EHT_NONE = 0, EHT_OPAQUE_FD = 0x00000001, From d872e3a5eac8c4d5f8fb8983e9b8a671af9513e3 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 13:28:26 +0700 Subject: [PATCH 144/149] Remove unnecessary friendship --- include/nbl/video/IDeviceMemoryAllocation.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 2a9ee332fa..f0066bc0b0 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,8 +24,7 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. 
*/ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { - friend class IDeviceMemoryAllocator; - friend class ILogicalDevice; + public: //! Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) From fe2d650dddb9223de2a2e12679d4e9678800c859 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 13:35:43 +0700 Subject: [PATCH 145/149] Remove const specifier on SCreationParams member --- include/nbl/video/IDeviceMemoryAllocation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index f0066bc0b0..37023af573 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -187,7 +187,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted struct SCreationParams: SInfo { core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; - const bool dedicated = false; + bool dedicated = false; }; inline const SCreationParams& getCreationParams() const { return m_params; } From 8ed5f771e445bd9e96165a66dc182413b19c61d1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 14:20:04 +0700 Subject: [PATCH 146/149] Log failure when closing externalHandle --- src/nbl/video/CVulkanMemoryAllocation.cpp | 5 +++-- src/nbl/video/CVulkanSemaphore.cpp | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 98227c161b..9ae712d258 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -14,8 +14,9 @@ CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { if (m_externalHandle != system::ExternalHandleNull) { - bool re = system::CloseExternalHandle(m_externalHandle); - assert(re); + bool success = 
system::CloseExternalHandle(m_externalHandle); + if (!success) m_vulkanDevice->getLogger()->log("Failed to close external handle for Vulkan memory allocation", system::ILogger::ELL_ERROR); + assert(success); } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 68e6f42ccb..eec4853402 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -12,7 +12,9 @@ CVulkanSemaphore::~CVulkanSemaphore() vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); if (m_creationParams.externalHandleTypes != EHT_NONE) { - system::CloseExternalHandle(m_externalHandle); + bool success = system::CloseExternalHandle(m_externalHandle); + if (!success) vulkanDevice->getLogger()->log("Failed to close external handle for Vulkan semaphore", system::ILogger::ELL_ERROR); + assert(success); } } From 0fdaa3cfda01f7110fbcccf820c14125056aac44 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 14:23:07 +0700 Subject: [PATCH 147/149] Create new api for createSemaphore --- include/nbl/video/ILogicalDevice.h | 4 +++- include/nbl/video/ISemaphore.h | 5 ++++- src/nbl/video/CVulkanLogicalDevice.cpp | 23 +++++++++++++++++++++-- src/nbl/video/CVulkanLogicalDevice.h | 3 ++- 4 files changed, 30 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 3ef6d12b64..5ebf7ccf1a 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! 
Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) = 0; + [[deprecated]] + virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& creationParams = {}) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 0d31ba0c60..54a92fb257 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -34,7 +34,10 @@ class ISemaphore : public IBackendObject core::bitflag externalHandleTypes = EHT_NONE; }; - struct SCreationParams : SCachedCreationParams {}; + struct SCreationParams : SCachedCreationParams + { + uint64_t initialValue; + }; // basically a pool function virtual uint64_t getCounterValue() const = 0; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index b577d6fc81..18c8edf06a 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -55,8 +55,27 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) +{ + + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; + type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo + type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; + type.initialValue = initialValue; + + 
VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,&type }; + createInfo.flags = static_cast(0); // flags must be 0 -core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams) + VkSemaphore semaphore; + if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) + return nullptr; + + ISemaphore::SCreationParams creationParams; + creationParams.initialValue = initialValue; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore, system::ExternalHandleNull); +} + +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaphore::SCreationParams&& creationParams) { // TODO(kevin) : Handle importing external semaphore into Vulkan @@ -71,7 +90,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; type.pNext = creationParams.externalHandleTypes.value ? 
&exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; - type.initialValue = initialValue; + type.initialValue = creationParams.initialValue; VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,&type }; createInfo.flags = static_cast(0); // flags must be 0 diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 09213f28db..0f2d7f160f 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -53,7 +53,8 @@ class CVulkanLogicalDevice final : public ILogicalDevice CVulkanLogicalDevice(core::smart_refctd_ptr&& api, renderdoc_api_t* const rdoc, const IPhysicalDevice* const physicalDevice, const VkDevice vkdev, const SCreationParams& params); // sync stuff - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) override; + core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& creationParams = {}) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; From 4e1a6975dbb6b15f8424942f47006df63c3604f1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 14:30:23 +0700 Subject: [PATCH 148/149] Slight refactor --- src/nbl/video/CVulkanMemoryAllocation.cpp | 2 +- src/nbl/video/CVulkanSemaphore.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 9ae712d258..dd2df9ea29 100644 --- 
a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -14,7 +14,7 @@ CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { if (m_externalHandle != system::ExternalHandleNull) { - bool success = system::CloseExternalHandle(m_externalHandle); + const auto success = system::CloseExternalHandle(m_externalHandle); if (!success) m_vulkanDevice->getLogger()->log("Failed to close external handle for Vulkan memory allocation", system::ILogger::ELL_ERROR); assert(success); } diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index eec4853402..24c11e01fa 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -12,7 +12,7 @@ CVulkanSemaphore::~CVulkanSemaphore() vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); if (m_creationParams.externalHandleTypes != EHT_NONE) { - bool success = system::CloseExternalHandle(m_externalHandle); + const auto success = system::CloseExternalHandle(m_externalHandle); if (!success) vulkanDevice->getLogger()->log("Failed to close external handle for Vulkan semaphore", system::ILogger::ELL_ERROR); assert(success); } From f7de243026760a674a5657649aa1689f4e4af12e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 13 May 2026 15:26:17 +0700 Subject: [PATCH 149/149] Add more ExternalHandleType --- include/nbl/video/IDeviceMemoryAllocation.h | 3 +++ include/nbl/video/IPhysicalDevice.h | 7 +++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 37023af573..d18c56cd0b 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -80,7 +80,10 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted EHT_D3D11_TEXTURE_KMT = 0x00000010, EHT_D3D12_HEAP = 0x00000020, EHT_D3D12_RESOURCE = 0x00000040, + EHT_DMA_BUF = 0x00000080, 
EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + EHT_SCI_BUF_NV = 0x00002000, + EHT_SCREEN_BUFFER_QNX = 0x00004000, }; // diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index e3cfe15a90..3fdeff0b2c 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -649,13 +649,12 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable struct SExternalMemoryProperties { - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes : 7; - IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes : 7; - // TODO(kevin): This should actually be core::bitflag to be semantically correct. What should we do? Should we use bool for each flag instead of enum? + // E_EXTERNAL_HANDLE_TYPE needs 15 bits to represent all of its values, so a bitfield would not save any space. + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes; E_EXTERNAL_MEMORY_FEATURE_FLAGS features : 3; bool operator == (SExternalMemoryProperties const& rhs) const = default; }; - static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); SExternalMemoryProperties getExternalBufferProperties( core::bitflag usages,