diff --git a/.github/scripts/ci_cuda_toolkit.py b/.github/scripts/ci_cuda_toolkit.py new file mode 100644 index 0000000000..975dc7101d --- /dev/null +++ b/.github/scripts/ci_cuda_toolkit.py @@ -0,0 +1,120 @@ +#!/usr/bin/env python3 + +import argparse +import os +import platform +import subprocess +from pathlib import Path + + +REQUIRED_HEADERS = ( + "cuda.h", + "nvrtc.h", + "cuda_fp16.h", + "vector_types.h", +) + + +def cuda_version() -> str: + version = os.environ.get("CUDA_VERSION", "").strip() + if not version: + raise SystemExit("CUDA_VERSION is not set.") + parts = version.split(".") + if len(parts) < 2 or not all(part.isdigit() for part in parts[:2]): + raise SystemExit(f"CUDA_VERSION must start with major.minor, got: {version}") + return version + + +def major_minor(version: str) -> str: + major, minor, *_ = version.split(".") + return f"{major}.{minor}" + + +def cache_base() -> Path: + base = os.environ.get("CUDA_CACHE_BASE", "").strip() + if not base: + raise SystemExit("CUDA_CACHE_BASE is not set.") + return Path(base) + + +def cache_root(version: str) -> str: + return str(cache_base() / f"v{major_minor(version)}") + + +def cache_key(version: str) -> str: + if platform.system() == "Windows": + return f"cuda-toolkit-{version}-windows-2025-x64-v2" + return f"cuda-toolkit-{version}-{platform.system().lower()}-x64-v1" + + +def cache_restore_key(version: str) -> str: + if platform.system() == "Windows": + return f"cuda-toolkit-{version}-windows-2025-x64-" + return f"cuda-toolkit-{version}-{platform.system().lower()}-x64-" + + +def emit_outputs() -> None: + version = cuda_version() + lines = ( + f"cache_root={cache_root(version)}", + f"cache_key={cache_key(version)}", + f"cache_restore_key={cache_restore_key(version)}", + ) + output = os.environ.get("GITHUB_OUTPUT") + if output: + with open(output, "a", encoding="utf-8") as file: + file.write("\n".join(lines)) + file.write("\n") + else: + print("\n".join(lines)) + + +def nvcc_path(root: Path) -> Path: + 
executable = "nvcc.exe" if platform.system() == "Windows" else "nvcc" + return root / "bin" / executable + + +def run(command: list[str], **kwargs) -> subprocess.CompletedProcess: + print("+", " ".join(command)) + return subprocess.run(command, check=False, text=True, **kwargs) + + +def verify_toolkit(root: Path, version: str) -> bool: + missing = [str(nvcc_path(root))] + missing.extend(str(root / "include" / header) for header in REQUIRED_HEADERS) + missing = [path for path in missing if not Path(path).exists()] + if missing: + print(f"CUDA Toolkit cache is incomplete at {root}.") + for path in missing: + print(f"missing: {path}") + return False + + result = run([str(nvcc_path(root)), "--version"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + print(result.stdout) + expected = f"release {major_minor(version)}" + if result.returncode != 0 or expected not in result.stdout: + print(f"Expected CUDA Toolkit {major_minor(version)} at {root}.") + return False + return True + + +def verify() -> None: + version = cuda_version() + root = Path(os.environ.get("CUDA_TOOLKIT_ROOT", cache_root(version))) + if not verify_toolkit(root, version): + raise SystemExit(1) + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("command", choices=("outputs", "verify")) + args = parser.parse_args() + + if args.command == "outputs": + emit_outputs() + elif args.command == "verify": + verify() + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/build-nabla.yml b/.github/workflows/build-nabla.yml index 8a62da4fc7..d6d593ebc5 100644 --- a/.github/workflows/build-nabla.yml +++ b/.github/workflows/build-nabla.yml @@ -9,6 +9,11 @@ permissions: contents: read actions: read +env: + CUDA_VERSION: '13.2.1' + CUDA_CACHE_BASE: 'C:\nabla-ci\cuda' + CUDA_CONTAINER_ROOT: 'C:\cuda' + concurrency: group: push-lock-${{ github.ref }} cancel-in-progress: true @@ -47,9 +52,36 @@ jobs: } & $rgExe --version + prepare-host-cuda: + name: Prepare host CUDA + 
runs-on: windows-2025 + + steps: + - name: Checkout CUDA CI helper + uses: actions/checkout@v6 + with: + fetch-depth: 1 + sparse-checkout: | + .github/scripts + + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} + + - name: Verify CUDA Toolkit + run: python .github/scripts/ci_cuda_toolkit.py verify + build-windows: name: Nabla (${{ matrix.os }}, ${{ matrix.vendor }}-${{ matrix.tag }}, ${{ matrix.config }}) - needs: prepare-host-rg + needs: [prepare-host-rg, prepare-host-cuda] runs-on: ${{ matrix.os }} env: @@ -165,6 +197,10 @@ jobs: with: submodules: 'recursive' + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs + - name: Restore ripgrep host tool id: cache-rg uses: actions/cache@v5 @@ -183,9 +219,29 @@ jobs: $rgDir | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append & $rgExe --version + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} + + - name: Verify CUDA Toolkit + run: python .github/scripts/ci_cuda_toolkit.py verify + - name: Pull Image run: | - docker pull "${{ env.image }}:${{ matrix.tag }}" + $image = "${{ env.image }}:${{ matrix.tag }}" + for ($attempt = 1; $attempt -le 3; $attempt++) { + docker pull $image + if ($LASTEXITCODE -eq 0) { + exit 0 + } + Write-Warning "docker pull failed for $image on attempt $attempt." 
+ Start-Sleep -Seconds (15 * $attempt) + } + exit $LASTEXITCODE - name: Run Container run: | @@ -199,8 +255,10 @@ jobs: --env-file .\docker\ci-windows.env ` --env-file .\docker\ninja.env ` --env "NSC_IMAGE_NAME=${{ steps.set-prefix.outputs.nscTargetTaggedImage }}" ` + --env "CUDA_PATH=${{ env.CUDA_CONTAINER_ROOT }}" ` --name orphan --network docker_default ` -v "${{ github.workspace }}:${{ env.mount }}" ` + -v "${{ steps.cuda.outputs.cache_root }}:${{ env.CUDA_CONTAINER_ROOT }}" ` -v "${pipeHost}:\\.\pipe\dockerd" -e "DOCKER_HOST=npipe:////./pipe/dockerd" ` -w "${{ env.mount }}" ` "${{ env.image }}:${{ matrix.tag }}" ` @@ -222,6 +280,7 @@ jobs: ${{ env.entry }} ${{ env.cmd }} -Command cmake ` --preset ci-configure-dynamic-${{ matrix.vendor }} ` -DCMAKE_INSTALL_PREFIX:PATH=C:/mount/nabla/build-ct/install ` + -DNBL_CUDA_TOOLKIT_ROOT:PATH=${{ env.CUDA_CONTAINER_ROOT }} ` --profiling-output=profiling/cmake-profiling.json ` --profiling-format=google-trace @@ -635,7 +694,13 @@ jobs: with: fetch-depth: 1 sparse-checkout: | + .github/scripts smoke + src/nbl/ext/CUDAInterop/smoke + + - name: CUDA Toolkit paths + id: cuda + run: python .github/scripts/ci_cuda_toolkit.py outputs - name: Download VulkanSDK uses: Devsh-Graphics-Programming/install-vulkan-sdk-action@v1.4.0-devsh.1 @@ -646,6 +711,17 @@ jobs: install_lavapipe: true github_token: ${{ github.token }} + - name: Restore CUDA Toolkit + id: cache-cuda + uses: actions/cache@v5 + with: + path: ${{ steps.cuda.outputs.cache_root }} + key: ${{ steps.cuda.outputs.cache_key }} + restore-keys: ${{ steps.cuda.outputs.cache_restore_key }} + + - name: Verify CUDA Toolkit + run: python .github/scripts/ci_cuda_toolkit.py verify + - name: Download Nabla install artifact uses: actions/download-artifact@v8 with: @@ -668,3 +744,15 @@ jobs: - name: Smoke Flow BUILD_ONLY run: cmake -D FLOW=BUILD_ONLY -D CONFIG=${{ matrix.config }} -P smoke/RunSmokeFlow.cmake + + - name: Build CUDA interop package smoke + shell: pwsh + run: | + cmake ` + 
-S src/nbl/ext/CUDAInterop/smoke ` + -B smoke/cuda-interop-smoke ` + -D "CMAKE_PREFIX_PATH=${{ github.workspace }}\smoke\build-ct\install\cmake" ` + -D "NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE=ON" ` + -D "Nabla_CUDA_TOOLKIT_ROOT=${{ steps.cuda.outputs.cache_root }}" + + cmake --build smoke/cuda-interop-smoke --config ${{ matrix.config }} diff --git a/3rdparty/jitify b/3rdparty/jitify index 0d6dbd8ccd..1a0ca0e837 160000 --- a/3rdparty/jitify +++ b/3rdparty/jitify @@ -1 +1 @@ -Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a +Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ba3410075..97ece5d9f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,15 +70,15 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +option(NBL_COMPILE_WITH_CUDA "Build CUDA interop support?" OFF) +set(NBL_CUDA_TOOLKIT_ROOT "" CACHE PATH "Optional CUDA Toolkit root used when NBL_COMPILE_WITH_CUDA is ON") if(NBL_COMPILE_WITH_CUDA) - find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER "9.0") - message(STATUS "CUDA version 9.0+ found!") - else() - message(FATAL_ERROR "CUDA version 9.0+ needed for C++14 support!") + if(NBL_CUDA_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT "${NBL_CUDA_TOOLKIT_ROOT}") endif() + find_package(CUDAToolkit 13.0 REQUIRED) + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") endif() get_filename_component(NBL_ROOT_PATH "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) @@ -183,13 +183,12 @@ option(NBL_BUILD_IMGUI "Enable nbl::ext::ImGui?" ON) option(NBL_BUILD_DEBUG_DRAW "Enable Nabla Debug Draw extension?" ON) option(NBL_BUILD_OPTIX "Enable nbl::ext::OptiX?" 
OFF) -if(NBL_COMPILE_WITH_CUDA) - find_package(OPTIX REQUIRED) - message(STATUS "CUDA enabled and OptiX found!") -else() - if(NBL_BUILD_OPTIX) +if(NBL_BUILD_OPTIX) + if(NOT NBL_COMPILE_WITH_CUDA) message(FATAL_ERROR "You cannot build Optix without enabled CUDA! NBL_COMPILE_WITH_CUDA must be ON!") endif() + find_package(OPTIX REQUIRED) + message(STATUS "CUDA enabled and OptiX found!") endif() option(NBL_BUILD_BULLET "Enable Bullet Physics building and integration?" OFF) @@ -313,6 +312,7 @@ if(NBL_ENABLE_CONFIG_INSTALL) set(_NBL_NABLA_CONFIG_FILES "${CMAKE_CURRENT_BINARY_DIR}/NablaConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/NablaConfigVersion.cmake" + "${CMAKE_CURRENT_LIST_DIR}/cmake/NablaCUDAInteropHelpers.cmake" ) install(EXPORT NablaExportTargets diff --git a/CMakePresets.json b/CMakePresets.json index 3c11567f46..2c25d06953 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -14,7 +14,7 @@ "NBL_EMBED_BUILTIN_RESOURCES": "ON", "NBL_NSC_MODE": "SOURCE", "NBL_UPDATE_GIT_SUBMODULE": "OFF", - "NBL_COMPILE_WITH_CUDA": "OFF", + "NBL_COMPILE_WITH_CUDA": "ON", "NBL_BUILD_OPTIX": "OFF", "NBL_BUILD_MITSUBA_LOADER": "ON", "NBL_BUILD_RADEON_RAYS": "OFF", diff --git a/cmake/FindZLIB.cmake b/cmake/FindZLIB.cmake index f855c396b9..42aa789bee 100644 --- a/cmake/FindZLIB.cmake +++ b/cmake/FindZLIB.cmake @@ -4,4 +4,6 @@ endif() set(ZLIB_FOUND TRUE) set(ZLIB_LIBRARY ZLIB::ZLIB) -set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") \ No newline at end of file +set(ZLIB_LIBRARIES ZLIB::ZLIB) +set(ZLIB_INCLUDE_DIR "${THIRD_PARTY_SOURCE_DIR}/zlib;${THIRD_PARTY_BINARY_DIR}/zlib") +set(ZLIB_INCLUDE_DIRS "${ZLIB_INCLUDE_DIR}") diff --git a/cmake/NablaCUDAInteropHelpers.cmake b/cmake/NablaCUDAInteropHelpers.cmake new file mode 100644 index 0000000000..e84b2d1a8e --- /dev/null +++ b/cmake/NablaCUDAInteropHelpers.cmake @@ -0,0 +1,28 @@ +function(nbl_target_link_cuda_interop TARGET_NAME SCOPE) + if(NOT SCOPE MATCHES "^(PRIVATE|PUBLIC|INTERFACE)$") + 
set(SCOPE PRIVATE) + endif() + cmake_parse_arguments(_NBL_CUDA_INTEROP "" "RUNTIME_JSON" "INCLUDE_DIRS" ${ARGN}) + target_link_libraries("${TARGET_NAME}" ${SCOPE} Nabla::ext::CUDAInterop) + set(_include_dir_entries "") + foreach(_include_dir IN LISTS _NBL_CUDA_INTEROP_INCLUDE_DIRS CUDAToolkit_INCLUDE_DIRS) + if(_include_dir) + file(TO_CMAKE_PATH "${_include_dir}" _include_dir) + list(APPEND _include_dir_entries " \"${_include_dir}\"") + endif() + endforeach() + list(JOIN _include_dir_entries "," _include_dirs_json) + set(_runtime_json [=[ +{ + "cudaRuntimeIncludeDirs": [ +@_include_dirs_json@ + ] +} +]=]) + string(CONFIGURE "${_runtime_json}" _runtime_json @ONLY) + set(_runtime_json_path "$/nbl_cuda_interop_runtime.json") + if(_NBL_CUDA_INTEROP_RUNTIME_JSON) + set(_runtime_json_path "${_NBL_CUDA_INTEROP_RUNTIME_JSON}") + endif() + file(GENERATE OUTPUT "${_runtime_json_path}" CONTENT "${_runtime_json}" TARGET "${TARGET_NAME}") +endfunction() diff --git a/cmake/NablaConfig.cmake.in b/cmake/NablaConfig.cmake.in index b22b3ad0d7..0464340ce3 100644 --- a/cmake/NablaConfig.cmake.in +++ b/cmake/NablaConfig.cmake.in @@ -6,6 +6,7 @@ set(Nabla_DXC_GIT_INFO_JSON_FILE "${PACKAGE_PREFIX_DIR}/include/dxc_git_info.jso set(_NBL_NABLA_LOAD_CORE OFF) set(_NBL_NABLA_LOAD_NSC OFF) +set(_NBL_NABLA_LOAD_CUDA_INTEROP OFF) set(_NBL_NABLA_COMPONENTS ${Nabla_FIND_COMPONENTS}) set(_NBL_NABLA_HAS_CORE_EXPORTS OFF) set(_NBL_NABLA_HAS_NSC_EXPORTS OFF) @@ -25,6 +26,10 @@ if(_NBL_NABLA_COMPONENTS) elseif(_NBL_NABLA_COMPONENT STREQUAL "Core") set(_NBL_NABLA_LOAD_CORE ON) set(Nabla_Core_FOUND TRUE) + elseif(_NBL_NABLA_COMPONENT STREQUAL "CUDAInterop") + set(_NBL_NABLA_LOAD_CORE ON) + set(_NBL_NABLA_LOAD_CUDA_INTEROP ON) + set(Nabla_CUDAInterop_FOUND TRUE) else() set("Nabla_${_NBL_NABLA_COMPONENT}_FOUND" FALSE) endif() @@ -80,6 +85,23 @@ if(_NBL_NABLA_LOAD_NSC) endif() endif() +if(_NBL_NABLA_LOAD_CUDA_INTEROP) + include(CMakeFindDependencyMacro) + + if(DEFINED Nabla_CUDA_TOOLKIT_ROOT AND NOT 
"${Nabla_CUDA_TOOLKIT_ROOT}" STREQUAL "") + set(CUDAToolkit_ROOT "${Nabla_CUDA_TOOLKIT_ROOT}") + endif() + + find_dependency(CUDAToolkit 13.0 REQUIRED) + _nbl_try_include_component("CUDAInterop" "NablaCUDAInteropExportTargets.cmake" _NBL_NABLA_CUDA_INTEROP_FOUND) + if(_NBL_NABLA_CUDA_INTEROP_FOUND AND TARGET Nabla::ext::CUDAInterop) + target_link_libraries(Nabla::ext::CUDAInterop INTERFACE CUDA::toolkit) + if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/NablaCUDAInteropHelpers.cmake") + endif() + endif() +endif() + check_required_components(Nabla) # diff --git a/examples_tests b/examples_tests index cb6dd1e14c..fcc4a4964a 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit cb6dd1e14c681e8c68257756211ecc9dc3715190 +Subproject commit fcc4a4964acfde19a0fcf60f66d1ec77b74ba136 diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index 3a7cbb5983..99f85e0b72 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -42,6 +42,8 @@ class IBuffer : public IDescriptor, public core::IBuffer //! synthetic Nabla inventions // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, + + EUF_SYNTHETIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ }; //! diff --git a/include/nbl/ext/CUDAInterop/CUDAInteropNative.h b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h new file mode 100644 index 0000000000..ea360d785a --- /dev/null +++ b/include/nbl/ext/CUDAInterop/CUDAInteropNative.h @@ -0,0 +1,58 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +/* + CUDA SDK opt-in boundary for Nabla CUDA interop. + + Public nbl/video CUDA interop headers expose SDK-free cuda_interop::SCU* opaque handles. 
This header is the + explicit boundary where a consumer accepts CUDA/NVRTC SDK headers, raw CU* types, and Nabla helper APIs whose + signatures use CUDA SDK types. This happens by linking Nabla::ext::CUDAInterop and including this file, which + includes cuda.h and nvrtc.h. The CUDA SDK becomes a compile-time requirement only for that SDK opt-in + consumer. + + The exported definitions stay in Nabla because they are glue between the Nabla world and the CUDA world: + dynamic Driver API/NVRTC loader access, NVRTC program helpers, error handling, runtime header discovery, and + CUDA/Vulkan resource interop lifetime. This header only exposes the CUDA-typed signatures for that glue after + the consumer explicitly opts in. Nabla::ext::CUDAInterop is the build-system edge for this SDK-typed surface. + It is not a separate owner of these definitions. Code that only consumes Nabla::Nabla does not need CUDA SDK + headers and does not parse CUDA/NVRTC declarations. + + Keeping SDK-defined types out of Nabla's public ABI is intentional. CUDA headers have changed observable + compile-time constants across SDK versions: + - CUDA Toolkit 9.0 documented CU_CTX_FLAGS_MASK as 0x1f. CUDA 12.1, 12.5, and 13.2 define it as 0xff. + - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS as 93. CUDA 12.1, 12.5, + and 13.2 keep 93 as CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V1 and define the unsuffixed name + as 122. + - CUDA Toolkit 9.0 documented CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR as 94. CUDA 12.1, 12.5, + and 13.2 keep 94 as CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V1 and define the unsuffixed name + as 123. + + If these SDK declarations leak through public Nabla headers, consumers can silently compile against a + different CUDA interpretation than the one used to build the interop implementation. That is especially + problematic for installed packages, plugins, and separately built downstream projects. 
The opaque handles + keep Nabla's public ABI independent from CUDA SDK headers. This opt-in header then validates handle + size/alignment against the SDK selected by the SDK opt-in consumer. +*/ +#ifndef _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#define _NBL_EXT_CUDA_INTEROP_NATIVE_H_INCLUDED_ +#include "nbl/video/CUDAInteropNativeAPI.h" +namespace nbl::video::cuda_native +{ + +/* + This header specializes the SDK-free opaque handles from nbl/video/CUDAInteropHandles.h for the CUDA SDK + visible to this translation unit. After that opt-in, Nabla interop methods can be called with native CUDA/NVRTC + types such as CUdeviceptr, CUexternalSemaphore, nvrtcProgram, CUresult, and nvrtcResult. + + The size/alignment checks live in nbl/video/CUDAInteropNativeAPI.h. This exact version check is a policy helper + for SDK-typed code that wants to warn about or reject compatible-but-different SDK headers. +*/ +inline bool isBuildCUDASDKVersionExactMatch() +{ + const auto buildVersion = CCUDAHandler::getBuildCUDASDKVersion(); + return buildVersion==CUDA_VERSION; +} + +} + +#endif diff --git a/include/nbl/ext/OptiX/IDenoiser.h b/include/nbl/ext/OptiX/IDenoiser.h index 7820aa1222..bb0677657d 100644 --- a/include/nbl/ext/OptiX/IDenoiser.h +++ b/include/nbl/ext/OptiX/IDenoiser.h @@ -5,7 +5,7 @@ #ifndef __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ #define __NBL_EXT_OPTIX_DENOISER_H_INCLUDED__ -#include "../../../../src/nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAHandler.h" #include #include @@ -122,4 +122,4 @@ class IDenoiser final : public core::IReferenceCounted } } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DefaultFuncPtrLoader.h b/include/nbl/system/DefaultFuncPtrLoader.h index 56142448c8..10fab3a454 100644 --- a/include/nbl/system/DefaultFuncPtrLoader.h +++ b/include/nbl/system/DefaultFuncPtrLoader.h @@ -11,18 +11,18 @@ namespace nbl::system { -class DefaultFuncPtrLoader final : FuncPtrLoader +class NBL_API2 DefaultFuncPtrLoader final : 
FuncPtrLoader { void* lib; public: inline DefaultFuncPtrLoader() : lib(nullptr) {} - NBL_API2 DefaultFuncPtrLoader(const char* name); + DefaultFuncPtrLoader(const char* name); inline DefaultFuncPtrLoader(DefaultFuncPtrLoader&& other) : DefaultFuncPtrLoader() { operator=(std::move(other)); } - NBL_API2 ~DefaultFuncPtrLoader(); + ~DefaultFuncPtrLoader(); inline DefaultFuncPtrLoader& operator=(DefaultFuncPtrLoader&& other) { @@ -40,4 +40,4 @@ class DefaultFuncPtrLoader final : FuncPtrLoader } -#endif \ No newline at end of file +#endif diff --git a/include/nbl/system/DynamicFunctionCaller.h b/include/nbl/system/DynamicFunctionCaller.h index cf99be32f0..d5642d3ea9 100644 --- a/include/nbl/system/DynamicFunctionCaller.h +++ b/include/nbl/system/DynamicFunctionCaller.h @@ -16,7 +16,7 @@ class DynamicFunctionCallerBase : public core::Unmovable { protected: static_assert(std::is_base_of::value, "Need a function pointer loader derived from `FuncPtrLoader`"); - FuncPtrLoaderT loader; + mutable FuncPtrLoaderT loader; public: //DynamicFunctionCallerBase() : loader() {} DynamicFunctionCallerBase(DynamicFunctionCallerBase&& other) : DynamicFunctionCallerBase() @@ -29,6 +29,16 @@ class DynamicFunctionCallerBase : public core::Unmovable } virtual ~DynamicFunctionCallerBase() = default; + inline bool isLibraryLoaded() const + { + return loader.isLibraryLoaded(); + } + + inline void* loadFuncPtr(const char* funcname) const + { + return loader.loadFuncPtr(funcname); + } + DynamicFunctionCallerBase& operator=(DynamicFunctionCallerBase&& other) { std::swap(loader, other.loader); @@ -41,6 +51,8 @@ class DynamicFunctionCallerBase : public core::Unmovable #define NBL_SYSTEM_IMPL_INIT_DYNLIB_FUNCPTR(FUNC_NAME) ,NBL_CONCATENATE(p , FUNC_NAME)(Base::loader.loadFuncPtr( #FUNC_NAME )) #define NBL_SYSTEM_IMPL_SWAP_DYNLIB_FUNCPTR(FUNC_NAME) std::swap(NBL_CONCATENATE(p, FUNC_NAME),other.NBL_CONCATENATE(p, FUNC_NAME)); +// Load an extra function from an already loaded dynamic library without 
adding it to the generated caller class. +#define NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(CALLER, FUNC_NAME) nbl::system::DynamicLibraryFunctionPointer((CALLER).loadFuncPtr(#FUNC_NAME)) #define NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS( CLASS_NAME, FUNC_PTR_LOADER_TYPE, ... ) \ class CLASS_NAME : public nbl::system::DynamicFunctionCallerBase\ diff --git a/include/nbl/system/ExternalHandle.h b/include/nbl/system/ExternalHandle.h new file mode 100644 index 0000000000..baac27a0b4 --- /dev/null +++ b/include/nbl/system/ExternalHandle.h @@ -0,0 +1,56 @@ +#ifndef __NBL_EXTERNAL_HANDLE_INCLUDED__ +#define __NBL_EXTERNAL_HANDLE_INCLUDED__ + +#ifdef _WIN32 + #ifndef WIN32_LEAN_AND_MEAN + #define WIN32_LEAN_AND_MEAN + #endif + #include +#else + #include +#endif + +namespace nbl::system +{ + +using external_handle_t = +#ifdef _WIN32 + void* +#else + int +#endif + ; + +#ifdef _WIN32 +constexpr external_handle_t ExternalHandleNull = nullptr; +#else +constexpr external_handle_t ExternalHandleNull = -1; +#endif + +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return close(handle) == 0; +#endif +} + +inline external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE duplicated = ExternalHandleNull; + + const HANDLE process = GetCurrentProcess(); + if (!DuplicateHandle(process, handle, process, &duplicated, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return duplicated; +#else + return dup(handle); +#endif +} + +} + +#endif diff --git a/include/nbl/system/declarations.h b/include/nbl/system/declarations.h index fa3dc2c6da..de632fb110 100644 --- a/include/nbl/system/declarations.h +++ b/include/nbl/system/declarations.h @@ -41,4 +41,7 @@ // frameworks (ugh, doesn't work!) 
//#include "nbl/system/IApplicationFramework.h" +// Handle for import and export gpu resource +#include "nbl/system/ExternalHandle.h" + #endif \ No newline at end of file diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 1120224fdb..d6a1378dcb 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -4,29 +4,29 @@ #ifndef _NBL_VIDEO_C_CUDA_DEVICE_H_ #define _NBL_VIDEO_C_CUDA_DEVICE_H_ +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" -#include "nbl/video/IPhysicalDevice.h" - - -#ifdef _NBL_COMPILE_WITH_CUDA_ - -#include "cuda.h" -#include "nvrtc.h" -#if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." -#endif - -// useful includes in the future -//#include "cudaEGL.h" -//#include "cudaVDPAU.h" +#include +#include +#include namespace nbl::video { class CCUDAHandler; -class CCUDADevice : public core::IReferenceCounted +class NBL_API2 CCUDADevice : public core::IReferenceCounted { - public: + public: +#ifdef _WIN32 + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; +#else + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; +#endif + enum E_VIRTUAL_ARCHITECTURE { EVA_30, @@ -45,154 +45,57 @@ class CCUDADevice : public core::IReferenceCounted EVA_80, EVA_COUNT }; - static inline constexpr const char* virtualArchCompileOption[] = { - "-arch=compute_30", - "-arch=compute_32", - "-arch=compute_35", - "-arch=compute_37", - "-arch=compute_50", - "-arch=compute_52", - "-arch=compute_53", - "-arch=compute_60", - "-arch=compute_61", - "-arch=compute_62", - "-arch=compute_70", - "-arch=compute_72", - "-arch=compute_75", - "-arch=compute_80" - }; - inline 
E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} + E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() const; - inline core::SRange geDefaultCompileOptions() const - { - return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; - } + ~CCUDADevice() override; - // TODO/REDO Vulkan: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability - // Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions) - // Also maybe separate this out into its own `CCUDA` class instead of nesting it here? -#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - 
} asImage; - }; - }; + core::SRange geDefaultCompileOptions() const; - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - + const CCUDAHandler* getHandler() const; + cuda_interop::SCUdevice getInternalObject() const; + cuda_interop::SCUcontext getContext() const; - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) + struct SExportableMemoryCreationParams { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) + size_t size; + uint32_t alignment; + uint32_t locationType; + }; + + inline size_t roundToGranularity(uint32_t locationType, size_t size) const { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; + if (locationType>=m_allocationGranularity.size()) + return 0u; + const auto granularity = m_allocationGranularity[locationType]; + if (size==0u || granularity==0u) + return 0u; + return ((size - 
1) / granularity + 1) * granularity; } - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif + core::smart_refctd_ptr createExportableMemory(SExportableMemoryCreationParams&& params); + core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); - protected: + core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); + + private: friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); - ~CCUDADevice() = default; - + + static constexpr uint32_t AllocationGranularityLocationTypeCount = 5u; + struct SNativeState; + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, std::unique_ptr&& nativeState, core::smart_refctd_ptr&& handler); + bool isValid() const; + + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_vulkanDevice; + std::array m_allocationGranularity = {}; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + bool m_valid = false; + + core::smart_refctd_ptr m_handler; + std::unique_ptr m_native; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h new file mode 100644 index 0000000000..126be44f37 --- /dev/null +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -0,0 +1,60 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. 
+// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ + +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" + +#include +#include + +namespace nbl::video +{ +class CCUDADevice; + +class NBL_API2 CCUDAExportableMemory final : public core::IReferenceCounted +{ + public: + struct SCachedCreationParams + { + size_t granularSize; + system::external_handle_t externalHandle; + bool deviceLocal; + }; + + ~CCUDAExportableMemory() override; + + cuda_interop::SCUdeviceptr getDeviceptr() const; + + /** + * @brief Exports the CUDA memory as a Vulkan device memory allocation. + * + * Creates an IDeviceMemoryAllocation object that references the underlying CUDA memory, + * allowing it to be used within the Vulkan rendering pipeline while maintaining + * interoperability with CUDA operations. + * + * @param device The logical device that will own the exported memory allocation. + * @param dedication Optional pointer to a device memory backed resource for dedicated allocation. + * If provided, the memory will be dedicated to that specific resource and + * automatically bound to it. + * @return A smart pointer to the exported IDeviceMemoryAllocation, or nullptr on failure. 
+ */ + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + private: + friend class CCUDADevice; + + struct SNativeState; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + static core::smart_refctd_ptr create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState); + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + std::unique_ptr m_native; +}; + +} + +#endif diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..4d2324cfa6 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -7,137 +7,92 @@ #include "nbl/core/declarations.h" #include "nbl/core/definitions.h" +#include "nbl/asset/ICPUBuffer.h" #include "nbl/system/declarations.h" +#include "nbl/system/path.h" +#include "nbl/video/CUDAInteropHandles.h" -#include "nbl/video/CCUDADevice.h" +#include +#include +#include +#include - -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { +class CCUDADevice; +class CVulkanConnection; +class IPhysicalDevice; -class CCUDAHandler : public core::IReferenceCounted +namespace cuda_native { - public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); - inline bool defaultHandleResult(CUresult result) - { - core::smart_refctd_ptr logger = m_logger.get(); - return defaultHandleResult(result,logger.get()); - } +// SDK-free forward declarations for the dynamic CUDA/NVRTC tables exposed by the opt-in native header. 
+class CUDA; +class NVRTC; +} - // - bool defaultHandleResult(nvrtcResult result); +namespace cuda_interop +{ +inline constexpr const char* RuntimePathsFileName = "nbl_cuda_interop_runtime.json"; +inline constexpr uint32_t RuntimeVersionComponentCount = 2u; +using SRuntimeVersion = std::array; - // - template - static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } +struct SRuntimeIncludeDir +{ + system::path path; + std::string source; + uint32_t cudaVersion = 0u; + bool completeRuntimeHeaderSet = false; +}; - // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); +struct SRuntimeCompileEnvironment +{ + core::vector includeDirs; + core::vector includeDirInfos; +}; - // - using LibLoader = system::DefaultFuncPtrLoader; - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v2 - ,cuDevicePrimaryCtxRetain - ,cuDevicePrimaryCtxRelease - ,cuDevicePrimaryCtxSetFlags - ,cuDevicePrimaryCtxGetState - ,cuCtxDestroy_v2 - ,cuCtxEnablePeerAccess - ,cuCtxGetApiVersion - ,cuCtxGetCurrent - ,cuCtxGetDevice - ,cuCtxGetSharedMemConfig - ,cuCtxPopCurrent_v2 - ,cuCtxPushCurrent_v2 - ,cuCtxSetCacheConfig - ,cuCtxSetCurrent - ,cuCtxSetSharedMemConfig - ,cuCtxSynchronize - ,cuDeviceComputeCapability - ,cuDeviceCanAccessPeer - ,cuDeviceGetCount - ,cuDeviceGet - ,cuDeviceGetAttribute - ,cuDeviceGetLuid - ,cuDeviceGetUuid - ,cuDeviceTotalMem_v2 - ,cuDeviceGetName - ,cuDriverGetVersion - ,cuEventCreate - ,cuEventDestroy_v2 - ,cuEventElapsedTime - ,cuEventQuery - ,cuEventRecord - ,cuEventSynchronize - ,cuFuncGetAttribute - ,cuFuncSetCacheConfig - ,cuGetErrorName - ,cuGetErrorString - ,cuGraphicsMapResources - ,cuGraphicsResourceGetMappedPointer_v2 - ,cuGraphicsResourceGetMappedMipmappedArray - ,cuGraphicsSubResourceGetMappedArray - ,cuGraphicsUnmapResources - ,cuGraphicsUnregisterResource - ,cuInit - ,cuLaunchKernel - ,cuMemAlloc_v2 - ,cuMemcpyDtoD_v2 - ,cuMemcpyDtoH_v2 - ,cuMemcpyHtoD_v2 - 
,cuMemcpyDtoDAsync_v2 - ,cuMemcpyDtoHAsync_v2 - ,cuMemcpyHtoDAsync_v2 - ,cuMemGetAddressRange_v2 - ,cuMemFree_v2 - ,cuMemFreeHost - ,cuMemGetInfo_v2 - ,cuMemHostAlloc - ,cuMemHostRegister_v2 - ,cuMemHostUnregister - ,cuMemsetD32_v2 - ,cuMemsetD32Async - ,cuMemsetD8_v2 - ,cuMemsetD8Async - ,cuModuleGetFunction - ,cuModuleGetGlobal_v2 - ,cuModuleLoadDataEx - ,cuModuleLoadFatBinary - ,cuModuleUnload - ,cuOccupancyMaxActiveBlocksPerMultiprocessor - ,cuPointerGetAttribute - ,cuStreamAddCallback - ,cuStreamCreate - ,cuStreamDestroy_v2 - ,cuStreamQuery - ,cuStreamSynchronize - ,cuStreamWaitEvent - ,cuSurfObjectCreate - ,cuSurfObjectDestroy - ,cuTexObjectCreate - ,cuTexObjectDestroy - ); - const CUDA& getCUDAFunctionTable() const {return m_cuda;} +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs); +NBL_API2 SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles); +inline core::vector makeNVRTCIncludeOptions(const SRuntimeCompileEnvironment& environment) +{ + core::vector options; + for (const auto& includeDir : environment.includeDirs) + options.push_back("-I" + includeDir.generic_string()); + return options; +} +} + +class NBL_API2 CCUDAHandler : public core::IReferenceCounted +{ + public: + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static uint32_t getBuildCUDASDKVersion(); + uint32_t getLoadedCUDADriverVersion() const; + cuda_interop::SRuntimeVersion getLoadedNVRTCVersion() const; + const cuda_native::CUDA& getCUDAFunctionTable() const; + const cuda_native::NVRTC& getNVRTCFunctionTable() const; + core::SRange getDefaultRuntimeIncludeOptions() const; + inline system::logger_opt_ptr getLogger() const { return m_logger.getOptRawPtr(); } + + struct SPTXResult + { + core::smart_refctd_ptr ptx; + 
cuda_interop::SNVRTCResult result; + }; - NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, - nvrtcGetErrorString, - nvrtcVersion, - nvrtcAddNameExpression, - nvrtcCompileProgram, - nvrtcCreateProgram, - nvrtcDestroyProgram, - nvrtcGetLoweredName, - nvrtcGetPTX, - nvrtcGetPTXSize, - nvrtcGetProgramLog, - nvrtcGetProgramLogSize + static bool defaultHandleResult(cuda_interop::SCUresult result, const system::logger_opt_ptr& logger); + bool defaultHandleResult(cuda_interop::SCUresult result) const; + bool defaultHandleResult(cuda_interop::SNVRTCResult result) const; + + cuda_interop::SNVRTCResult createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); + cuda_interop::SNVRTCResult compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const; + cuda_interop::SNVRTCResult getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const; + SPTXResult getPTX(cuda_interop::SNVRTCProgram prog) const; + SPTXResult compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string* log=nullptr, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr ); - const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} - // inline core::SRange getSTDHeaders() { auto begin = m_headers.empty() ? 
nullptr:(&m_headers[0].get()); @@ -146,130 +101,34 @@ class CCUDAHandler : public core::IReferenceCounted inline const auto& getSTDHeaderContents() { return m_headerContents; } inline const auto& getSTDHeaderNames() { return m_headerNames; } - // - nvrtcResult createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr); - inline nvrtcResult createProgram(nvrtcProgram* prog, const char* source, const char* name, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) + struct SCUDADeviceInfo { - return createProgram(prog,std::string(source),name,headerCount,headerContents,includeNames); - } - inline nvrtcResult createProgram(nvrtcProgram* prog, system::IFile* file, const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr) - { - const auto filesize = file->getSize(); - std::string source(filesize+1u,'0'); - - system::future bytesRead; - file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); - - return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); - } - - // - inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) - { - return m_nvrtc.pnvrtcCompileProgram(prog,options.size(),options.begin()); - } - - // - nvrtcResult getProgramLog(nvrtcProgram prog, std::string& log); - - // - struct ptx_and_nvrtcResult_t - { - core::smart_refctd_ptr ptx; - nvrtcResult result; + std::array uuid = {}; }; - ptx_and_nvrtcResult_t getPTX(nvrtcProgram prog); - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - std::string&& source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - 
nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? - }); - result = createProgram(&program,std::move(source),filename,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - const char* source, const char* filename, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) - { - return compileDirectlyToPTX(std::string(source),filename,nvrtcOptions,headerCount,headerContents,includeNames,log); - } - inline ptx_and_nvrtcResult_t compileDirectlyToPTX( - system::IFile* file, core::SRange nvrtcOptions, - const int headerCount=0, const char* const* headerContents=nullptr, const char* const* includeNames=nullptr, - std::string* log=nullptr - ) + inline core::vector const& getAvailableDevices() const { - nvrtcProgram program = nullptr; - nvrtcResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; - auto cleanup = core::makeRAIIExiter([&]() -> void - { - if (result!=NVRTC_SUCCESS && program) - m_nvrtc.pnvrtcDestroyProgram(&program); // TODO: do we need to destroy the program if we successfully get PTX? 
- }); - - result = createProgram(&program,file,headerCount,headerContents,includeNames); - return compileDirectlyToPTX_impl(result,program,nvrtcOptions,log); + return m_availableDevices; } core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } - ~CCUDAHandler() = default; - - // - inline ptx_and_nvrtcResult_t compileDirectlyToPTX_impl(nvrtcResult result, nvrtcProgram program, core::SRange nvrtcOptions, std::string* log) - { - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; + ~CCUDAHandler() override; - result = compileProgram(program,nvrtcOptions); - if (log) - getProgramLog(program,*log); - if (result!=NVRTC_SUCCESS) - return {nullptr,result}; - - return getPTX(program); - } + private: + struct SNativeState; + CCUDAHandler(std::unique_ptr&& nativeState, core::vector>&& _headers, core::smart_refctd_ptr&& _logger); - // function tables - CUDA m_cuda; - NVRTC m_nvrtc; - - // + std::unique_ptr m_native; + core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; core::vector m_headerNames; system::logger_opt_smart_ptr m_logger; - int m_version; }; } -#endif // _NBL_COMPILE_WITH_CUDA_ - #endif diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h new file mode 100644 index 0000000000..7dde4908af --- /dev/null +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -0,0 +1,34 @@ +#ifndef 
_NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H_ + +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" + +#include + +namespace nbl::video +{ + +class CCUDADevice; + +class NBL_API2 CCUDAImportedMemory final : public core::IReferenceCounted +{ + public: + ~CCUDAImportedMemory() override; + cuda_interop::SCUexternalMemory getInternalObject() const; + bool getMappedBuffer(cuda_interop::SOutput mappedBuffer) const; + + private: + friend class CCUDADevice; + + struct SNativeState; + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; +}; + +} + +#endif diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h new file mode 100644 index 0000000000..204e1b79f3 --- /dev/null +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ + +#include "nbl/video/declarations.h" +#include "nbl/video/CUDAInteropHandles.h" + +#include +#include + +namespace nbl::video +{ + +class CCUDADevice; + +class NBL_API2 CCUDAImportedSemaphore final : public core::IReferenceCounted +{ + public: + ~CCUDAImportedSemaphore() override; + cuda_interop::SCUexternalSemaphore getInternalObject() const; + + private: + friend class CCUDADevice; + + struct SNativeState; + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState); + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + std::unique_ptr m_native; +}; + +} + +#endif diff --git a/include/nbl/video/CUDAInterop.h b/include/nbl/video/CUDAInterop.h new file mode 100644 index 0000000000..efea886b96 --- /dev/null +++ b/include/nbl/video/CUDAInterop.h @@ -0,0 +1,14 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_H_INCLUDED_ + +#include "nbl/video/CUDAInteropHandles.h" +#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" + +#endif diff --git a/include/nbl/video/CUDAInteropHandles.h b/include/nbl/video/CUDAInteropHandles.h new file mode 100644 index 0000000000..a7664310aa --- /dev/null +++ b/include/nbl/video/CUDAInteropHandles.h @@ -0,0 +1,146 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_HANDLES_H_INCLUDED_ + +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ + +/* + SDK-free CUDA interop boundary. + + Public nbl/video/CCUDA*.h headers cannot include cuda.h or nvrtc.h, but they still need to carry CUDA interop + state and write CUDA/NVRTC handles for opt-in users. The split below keeps those two roles explicit: + - SOpaqueCUDAHandle owns handle bits and is used in Nabla object layout, parameters, and return values. + - SOutput is a non-owning output adapter. C++ does not apply user-defined conversions through T* or mutable T&, + so output parameters need a small bridge to write directly into either SCU* storage or native SDK storage. + + CUDAInteropNative.h is the only header that maps these opaque types back to CUDA/NVRTC SDK types. These helpers + are class templates with in-class member definitions, so they are inline by the language rules and add no exported + symbols. +*/ +template +struct SOpaqueCUDANativeType; + +template +concept cuda_opaque_handle = + std::is_trivially_copyable_v && + std::is_trivially_copyable_v && + sizeof(Opaque)==sizeof(Native) && + alignof(Opaque)==alignof(Native); + +template +concept cuda_native_handle_for = + requires { typename SOpaqueCUDANativeType::type; } && + std::same_as,typename SOpaqueCUDANativeType::type> && + cuda_opaque_handle>; + +/* + Non-owning output bridge for SDK-free APIs. It keeps one Nabla signature while opt-in callers can pass raw + CUDA/NVRTC output variables directly, e.g. `CUdeviceptr ptr; memory->getMappedBuffer(ptr);`. 
+*/ +template +struct SOutput +{ + SOutput(std::nullptr_t) : ptr(nullptr) {} + SOutput(Opaque& opaque) : ptr(&opaque) {} + SOutput(Opaque* opaque) : ptr(opaque) {} + + template + requires cuda_native_handle_for + SOutput(Native& native) : ptr(reinterpret_cast(&native)) {} + + template + requires cuda_native_handle_for + SOutput(Native* native) : ptr(reinterpret_cast(native)) {} + + Opaque& operator*() const { return *ptr; } + operator Opaque*() const { return ptr; } + explicit operator bool() const { return ptr!=nullptr; } + + private: + Opaque* ptr; +}; + +/* + Owned opaque value used in public Nabla ABI. Native reference conversions become available only after the opt-in + header specializes SOpaqueCUDANativeType for the selected CUDA SDK. +*/ +template +struct alignas(alignof(Storage)) SOpaqueCUDAHandle +{ + uint8_t value[sizeof(Storage)] = {}; + + SOpaqueCUDAHandle() = default; + + template + requires cuda_native_handle_for + SOpaqueCUDAHandle(const Native& native) + { + operator=(native); + } + + template + requires cuda_native_handle_for + operator Native&() + { + return *reinterpret_cast(value); + } + + template + requires cuda_native_handle_for + operator const Native&() const + { + return *reinterpret_cast(value); + } + + template + requires cuda_native_handle_for + Derived& operator=(const Native& native) + { + static_cast(*this) = native; + return static_cast(*this); + } + + template + requires cuda_native_handle_for + friend bool operator==(const Derived& lhs, const Native& rhs) + { + return static_cast(lhs)==rhs; + } + + template + requires cuda_native_handle_for + friend bool operator==(const Native& lhs, const Derived& rhs) + { + return lhs==static_cast(rhs); + } +}; + +#define NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(NAME, STORAGE) \ + struct NAME : SOpaqueCUDAHandle \ + { \ + using SOpaqueCUDAHandle::SOpaqueCUDAHandle; \ + using SOpaqueCUDAHandle::operator=; \ + } + +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdevice, int32_t); 
+NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUcontext, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUdeviceptr, uintptr_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalMemory, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUexternalSemaphore, void*); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SCUresult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCResult, int32_t); +NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE(SNVRTCProgram, void*); + +#undef NBL_CUDA_INTEROP_DECLARE_OPAQUE_HANDLE + +} + +#endif diff --git a/include/nbl/video/CUDAInteropNativeAPI.h b/include/nbl/video/CUDAInteropNativeAPI.h new file mode 100644 index 0000000000..6084d4a00c --- /dev/null +++ b/include/nbl/video/CUDAInteropNativeAPI.h @@ -0,0 +1,191 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_API_H_INCLUDED_ + +#include +#include + +#include "nbl/video/CUDAInterop.h" +#include "nbl/system/DynamicFunctionCaller.h" + +#include "cuda.h" +#include "nvrtc.h" + +namespace nbl::video::cuda_interop +{ + +#define NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(OPAQUE, NATIVE) \ + template<> struct SOpaqueCUDANativeType { using type = NATIVE; }; \ + static_assert(cuda_opaque_handle) + +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdevice, CUdevice); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUcontext, CUcontext); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUdeviceptr, CUdeviceptr); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalMemory, CUexternalMemory); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUexternalSemaphore, CUexternalSemaphore); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SCUresult, CUresult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCResult, nvrtcResult); +NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE(SNVRTCProgram, nvrtcProgram); + +#undef 
NBL_CUDA_INTEROP_DECLARE_NATIVE_HANDLE + +} + +namespace nbl::video::cuda_native +{ + +inline constexpr int MinimumCUDADriverVersion = 13000; +inline constexpr int MinimumNVRTCMajorVersion = MinimumCUDADriverVersion/1000; +static_assert(CUDA_VERSION >= MinimumCUDADriverVersion, "Need CUDA 13.0 SDK or higher."); + +/* + Low-level CUDA SDK boundary shared by Nabla's CUDA implementation and explicit CUDA interop opt-in users. + + This file lives under include/ because it is shared with nbl/ext/CUDAInterop/CUDAInteropNative.h, the public + opt-in header for consumers that explicitly accept CUDA SDK types. Its physical location does not make it part + of the default Nabla public interface: nbl/video/CCUDA*.h headers, Nabla::Nabla public requirements, and PCH + do not include it, so normal Nabla consumers do not need cuda.h or nvrtc.h. + + The declarations below intentionally use CUDA/NVRTC SDK types because they describe the SDK-typed glue between + raw CUDA code and Nabla's exported CUDA interop objects. Consumers enter this surface only by linking + Nabla::ext::CUDAInterop and including nbl/ext/CUDAInterop/CUDAInteropNative.h. +*/ +using LibLoader = system::DefaultFuncPtrLoader; + +/* + The CUDA/NVRTC table classes contain the calls used and tested by Nabla's interop implementation. SDK opt-in + consumers can load additional Driver API or NVRTC symbols from the same table without changing Nabla's ABI: + + auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); + + The requested symbol must be declared by the CUDA SDK visible to that translation unit because the helper uses + decltype(cuNewCall) to preserve the native function signature. 
+*/ +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader + ,cuCtxCreate_v4 + ,cuDevicePrimaryCtxRetain + ,cuDevicePrimaryCtxRelease + ,cuDevicePrimaryCtxSetFlags + ,cuDevicePrimaryCtxGetState + ,cuCtxDestroy_v2 + ,cuCtxEnablePeerAccess + ,cuCtxGetApiVersion + ,cuCtxGetCurrent + ,cuCtxGetDevice + ,cuCtxGetSharedMemConfig + ,cuCtxPopCurrent_v2 + ,cuCtxPushCurrent_v2 + ,cuCtxSetCacheConfig + ,cuCtxSetCurrent + ,cuCtxSetSharedMemConfig + ,cuCtxSynchronize + ,cuDeviceComputeCapability + ,cuDeviceCanAccessPeer + ,cuDeviceGetCount + ,cuDeviceGet + ,cuDeviceGetAttribute + ,cuDeviceGetLuid + ,cuDeviceGetUuid_v2 + ,cuDeviceTotalMem_v2 + ,cuDeviceGetName + ,cuDriverGetVersion + ,cuEventCreate + ,cuEventDestroy_v2 + ,cuEventElapsedTime + ,cuEventQuery + ,cuEventRecord + ,cuEventSynchronize + ,cuFuncGetAttribute + ,cuFuncSetCacheConfig + ,cuGetErrorName + ,cuGetErrorString + ,cuGraphicsMapResources + ,cuGraphicsResourceGetMappedPointer_v2 + ,cuGraphicsResourceGetMappedMipmappedArray + ,cuGraphicsSubResourceGetMappedArray + ,cuGraphicsUnmapResources + ,cuGraphicsUnregisterResource + ,cuInit + ,cuLaunchKernel + ,cuMemAlloc_v2 + ,cuMemcpyDtoD_v2 + ,cuMemcpyDtoH_v2 + ,cuMemcpyHtoD_v2 + ,cuMemcpyDtoDAsync_v2 + ,cuMemcpyDtoHAsync_v2 + ,cuMemcpyHtoDAsync_v2 + ,cuMemGetAddressRange_v2 + ,cuMemFree_v2 + ,cuMemFreeHost + ,cuMemGetInfo_v2 + ,cuMemHostAlloc + ,cuMemHostRegister_v2 + ,cuMemHostUnregister + ,cuMemsetD32_v2 + ,cuMemsetD32Async + ,cuMemsetD8_v2 + ,cuMemsetD8Async + ,cuModuleGetFunction + ,cuModuleGetGlobal_v2 + ,cuModuleLoadDataEx + ,cuModuleLoadFatBinary + ,cuModuleUnload + ,cuOccupancyMaxActiveBlocksPerMultiprocessor + ,cuPointerGetAttribute + ,cuStreamAddCallback + ,cuStreamCreate + ,cuStreamDestroy_v2 + ,cuStreamQuery + ,cuStreamSynchronize + ,cuStreamWaitEvent + ,cuSurfObjectCreate + ,cuSurfObjectDestroy + ,cuTexObjectCreate + ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + 
,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback +); + +NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(NVRTC,LibLoader, + nvrtcGetErrorString, + nvrtcVersion, + nvrtcAddNameExpression, + nvrtcCompileProgram, + nvrtcCreateProgram, + nvrtcDestroyProgram, + nvrtcGetLoweredName, + nvrtcGetPTX, + nvrtcGetPTXSize, + nvrtcGetProgramLog, + nvrtcGetProgramLogSize +); + +#define NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler) \ + do { \ + const auto nblCudaInteropResult = (expr); \ + if (!(handler)->defaultHandleResult(nblCudaInteropResult)) \ + assert(false); \ + } while (false) + +} + +#endif diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index e6d17ddf3e..696d69058f 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface protected: // special constructor for when memory requirements are known up-front (so far only swapchains and internal forwarding here) CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle); - CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) : - CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {} + CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) : + 
CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {} private: - static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle); + static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle); core::smart_refctd_ptr m_memory = nullptr; size_t m_offset = 0u; diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index e670dc90d8..cee9d3c081 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -1,7 +1,6 @@ #ifndef __NBL_E_API_TYPE_H_INCLUDED__ #define __NBL_E_API_TYPE_H_INCLUDED__ -#include "nbl/core/declarations.h" #include namespace nbl::video diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 00e55a66e3..d18c56cd0b 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,6 +24,7 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { + public: //! Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) @@ -61,13 +62,30 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //EMPF_RDMA_CAPABLE_BIT_NV = 0x00000200, }; // - enum E_MEMORY_HEAP_FLAGS : uint32_t + enum E_MEMORY_HEAP_FLAGS : uint8_t { EMHF_NONE = 0, EMHF_DEVICE_LOCAL_BIT = 0x00000001, EMHF_MULTI_INSTANCE_BIT = 0x00000002, }; + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint16_t + { + EHT_NONE = 0, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_DMA_BUF = 0x00000080, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + EHT_SCI_BUF_NV = 0x00002000, + EHT_SCREEN_BUFFER_QNX = 0x00004000, + }; + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} @@ -75,26 +93,30 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + [[deprecated]] + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + [[deprecated]] + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + [[deprecated]] + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! - inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + [[deprecated]] + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! 
Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ // TODO: Visible is a misnomer, collides with Vulkan memory model nomenclature where visibility only concerns reads, where as this is both read and write (visibility and availability) inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! @@ -110,9 +132,9 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return nullptr; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return nullptr; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -153,29 +175,51 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! 
Imports the given handle if importHandle != nullptr && externalHandleType != EHT_NONE + //! Creates exportable memory if importHandle == nullptr && externalHandleType != EHT_NONE + // Note: closing importHandle is not the responsibility of this class + system::external_handle_t importHandle = 0; + }; + + struct SCreationParams: SInfo + { + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + bool dedicated = false; + }; + + inline const SCreationParams& getCreationParams() const { return m_params; } + + virtual system::external_handle_t getExportHandle() const = 0; + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_params(std::move(params)) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; + const ILogicalDevice* m_originDevice = nullptr; + SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; MemoryRange m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) } // end namespace nbl::video -#endif - - +#endif \ No newline at end of file diff --git a/include/nbl/video/IDeviceMemoryAllocator.h 
b/include/nbl/video/IDeviceMemoryAllocator.h index e85eec12a0..94e112a76a 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -15,13 +15,18 @@ class NBL_API2 IDeviceMemoryAllocator // right now we only support this interface handing out memory for one device or group virtual ILogicalDevice* getDeviceForAllocations() const = 0; - struct SAllocateInfo + struct SAllocateInfo : IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. + uint8_t memoryTypeIndex = 0u; + }; + + struct SAllocateParams { + IDeviceMemoryBacked* dedication = nullptr; + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + system::external_handle_t externalHandle = system::ExternalHandleNull; }; struct SAllocation @@ -45,8 +50,13 @@ class NBL_API2 IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) : + m_allocateFlags(static_cast(allocateFlags.value)), + m_reqs(reqs), + m_handleType(handleType) + {} static inline uint32_t end() {return 32u;} @@ -56,13 +66,15 @@ class NBL_API2 
IDeviceMemoryAllocator return *this; } - inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) + inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication, system::external_handle_t external_handle) { SAllocateInfo ret; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.importHandle = external_handle; return ret; } @@ -75,13 +87,19 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; }; //! DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType + ) : + IMemoryTypeIterator(reqs, allocateFlags, handleType) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } @@ -105,13 +123,14 @@ class NBL_API2 IDeviceMemoryAllocator }; template + // TODO(kevinyu) : Fix all example_tests if this api change to use SAllocateParams is approved inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + const SAllocateParams& params) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for 
(memory_type_iterator_t memTypeIt(reqs, params.allocateFlags, params.externalHandleType); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { - SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); + SAllocateInfo allocateInfo = memTypeIt.operator()(params.dedication, params.externalHandle); auto allocation = allocate(allocateInfo); if (allocation.isValid()) return allocation; diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index b0c0ce05ed..04693456d7 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -39,6 +39,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify multiple queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 742cb506c6..5ebf7ccf1a 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! 
Semaphore Stuff + [[deprecated]] virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& creationParams = {}) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small @@ -331,39 +333,11 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit (%u)!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Buffer, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } 
- if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - // TODO: validation of creationParams against the device's limits (sample counts, etc.) see vkCreateImage docs - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams); // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { @@ -1132,9 +1106,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/include/nbl/video/IPhysicalDevice.h 
b/include/nbl/video/IPhysicalDevice.h index 4222a22153..3fdeff0b2c 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -639,6 +639,57 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return std::span(m_initData.qfamProperties->data(),m_initData.qfamProperties->data()+m_initData.qfamProperties->size()); } + enum E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t + { + EEMF_NONE = 0x0, + EEMF_DEDICATED_ONLY_BIT = 0x1, + EEMF_EXPORTABLE_BIT = 0x2, + EEMF_IMPORTABLE_BIT = 0x4, + }; + + struct SExternalMemoryProperties + { + // Need 15 bits to store all possible values of E_EXTERNAL_HANDLE_TYPE, so a bitfield will not save any space. + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes; + E_EXTERNAL_MEMORY_FEATURE_FLAGS features : 3; + bool operator == (SExternalMemoryProperties const& rhs) const = default; + }; + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usages, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usages &= ~asset::IBuffer::EUF_SYNTHETIC_FLAGS_MASK; // mask out synthetic flags + + // TODO(kevinyu): Should we cache the properties like Atil does? If yes, this needs a mutex and a mutable specifier, and the class becomes not that simple anymore. 
+ // { + // std::shared_lock lock(m_externalBufferPropertiesMutex); + // auto it = m_externalBufferProperties.find({ usage, handleType }); + // if (it != m_externalBufferProperties.end()) + // return it->second; + // } + // + // std::unique_lock lock(m_externalBufferPropertiesMutex); + // return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + return getExternalMemoryProperties_impl(usages, handleType); + } + + struct SImageFormatInfo + { + asset::E_FORMAT format; + IGPUImage::E_TYPE type; + IGPUImage::TILING tiling; + core::bitflag usage; + core::bitflag flags; + }; + SExternalMemoryProperties getExternalImageProperties( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + return getExternalMemoryProperties_impl(info, handleType); + } + struct SBufferFormatPromotionRequest { asset::E_FORMAT originalFormat = asset::EF_UNKNOWN; SFormatBufferUsages::SUsage usages = SFormatBufferUsages::SUsage(); @@ -683,6 +734,10 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable }; inline IPhysicalDevice(SInitData&& _initData) : m_initData(std::move(_initData)) {} + // External memory properties query + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + // ILogicalDevice creation bool validateLogicalDeviceCreation(const ILogicalDevice::SCreationParams& params) const; virtual core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) = 0; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index d4fbdd1756..54a92fb257 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ 
-15,6 +15,30 @@ namespace nbl::video class ISemaphore : public IBackendObject { public: + + //! Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCachedCreationParams + { + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + }; + + struct SCreationParams : SCachedCreationParams + { + uint64_t initialValue; + }; + // basically a pool function virtual uint64_t getCounterValue() const = 0; @@ -146,9 +170,18 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + virtual system::external_handle_t getExportHandle() const = 0; + + const SCachedCreationParams& getCreationParams() const { return m_creationParams; } + + + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : + IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} virtual ~ISemaphore() = default; + + SCachedCreationParams m_creationParams; }; } diff --git a/include/nbl/video/declarations.h b/include/nbl/video/declarations.h index 37f2f864bf..1a74514714 100644 --- a/include/nbl/video/declarations.h +++ b/include/nbl/video/declarations.h @@ -24,9 +24,6 @@ #include "nbl/video/CVulkanImage.h" #include "nbl/video/surface/CSurfaceVulkan.h" -// CUDA -#include "nbl/video/CCUDAHandler.h" - // utilities #include "nbl/video/utilities/CDumbPresentationOracle.h" #include "nbl/video/utilities/ICommandPoolCache.h" @@ -39,9 +36,10 @@ #include "nbl/video/utilities/CSmoothResizeSurface.h" #include "nbl/video/utilities/CDefaultSwapchainFramebuffers.h" #include "nbl/video/utilities/CAssetConverter.h" +#include "nbl/video/CUDAInterop.h" 
//VT //#include "nbl/video/IGPUVirtualTexture.h" -#endif \ No newline at end of file +#endif diff --git a/include/nbl/video/utilities/IUtilities.h b/include/nbl/video/utilities/IUtilities.h index f52d5d36ef..7ab885572a 100644 --- a/include/nbl/video/utilities/IUtilities.h +++ b/include/nbl/video/utilities/IUtilities.h @@ -108,7 +108,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto reqs = buffer->getMemoryReqs(); reqs.memoryTypeBits &= physicalDevice->getDownStreamingMemoryTypeBits(); - auto deviceMemAllocation = device->allocate(reqs, buffer.get(), allocateFlags); + auto deviceMemAllocation = device->allocate(reqs, { buffer.get(), allocateFlags }); if (!deviceMemAllocation.isValid()) { @@ -143,7 +143,7 @@ class NBL_API2 IUtilities : public core::IReferenceCounted auto reqs = buffer->getMemoryReqs(); reqs.memoryTypeBits &= physicalDevice->getUpStreamingMemoryTypeBits(); - auto deviceMemAllocation = device->allocate(reqs, buffer.get(), allocateFlags); + auto deviceMemAllocation = device->allocate(reqs, { buffer.get(), allocateFlags }); if (!deviceMemAllocation.isValid()) { diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 9c994bfa41..317cf3d2a1 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -95,12 +95,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) - message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) - if (NBL_BUILD_OPTIX) - set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) - endif() +if (NBL_BUILD_OPTIX) + set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() # => TODO: clean! 
@@ -128,6 +124,15 @@ set(NBL_CORE_SOURCES core/alloc/refctd_memory_resource.cpp core/hash/blake.cpp ) + +set(NBL_CUDA_INTEROP_SOURCES + video/CCUDADevice.cpp + video/CCUDAExportableMemory.cpp + video/CCUDAHandler.cpp + video/CCUDAImportedMemory.cpp + video/CCUDAImportedSemaphore.cpp +) + set(NBL_SYSTEM_SOURCES system/DefaultFuncPtrLoader.cpp system/IFileBase.cpp @@ -291,9 +296,6 @@ set(NBL_VIDEO_SOURCES video/CVulkanEvent.cpp video/CSurfaceVulkan.cpp -# CUDA - video/CCUDAHandler.cpp - video/CCUDADevice.cpp ) set(NBL_SCENE_SOURCES @@ -312,6 +314,7 @@ set(NABLA_SRCS_COMMON ${NBL_VIDEO_SOURCES} ${NBL_SCENE_SOURCES} ${NBL_META_SOURCES} + ${NBL_CUDA_INTEROP_SOURCES} ) if(MSVC) @@ -422,6 +425,11 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() +if(NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PRIVATE _NBL_COMPILE_WITH_CUDA_) + target_include_directories(Nabla PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +endif() + set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) @@ -657,11 +665,6 @@ target_link_libraries(Nabla PRIVATE volk) # volk is part of public interface headers in Nabla target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) -# CUDA -if (NBL_COMPILE_WITH_CUDA) - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") -endif() - list(APPEND PUBLIC_BUILD_INCLUDE_DIRS # this should be PRIVATE, but things from /src (or /source) are sometimes included in things in /include and so examples have to put source dirs into theirs Include Path # -> TODO @@ -774,8 +777,11 @@ if(TARGET ngfx) ) endif() -# on MSVC it won't compile without this option! -target_compile_options(Nabla PUBLIC $<$:/bigobj>) +# on MSVC it won't compile without these options! 
+target_compile_options(Nabla PUBLIC + $<$:/bigobj> + $<$:/Zc:preprocessor> +) if(NBL_PCH) target_precompile_headers(Nabla @@ -786,11 +792,24 @@ if(NBL_PCH) ) endif() -# extensions start_tracking_variables_for_propagation_to_parent() add_subdirectory(ext EXCLUDE_FROM_ALL) propagate_changed_variables_to_parent_scope() +if(DEFINED NBL_EXT_CUDA_INTEROP_LIB AND TARGET ${NBL_EXT_CUDA_INTEROP_LIB}) + if(NBL_ENABLE_CONFIG_INSTALL AND NOT NBL_STATIC_BUILD) + install(TARGETS ${NBL_EXT_CUDA_INTEROP_LIB} + EXPORT NablaCUDAInteropExportTargets + COMPONENT Libraries + ) + install(EXPORT NablaCUDAInteropExportTargets + NAMESPACE Nabla:: + DESTINATION cmake + COMPONENT Libraries + ) + endif() +endif() + if(TARGET ${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB}) set_target_properties(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} PROPERTIES EXCLUDE_FROM_ALL OFF) nbl_install_lib_spec(${NBL_EXT_FULL_SCREEN_TRIANGLE_LIB} "nbl/ext/FULL_SCREEN_TRIANGLE") diff --git a/src/nbl/ext/CMakeLists.txt b/src/nbl/ext/CMakeLists.txt index f3b55531c2..264cfc7c2d 100644 --- a/src/nbl/ext/CMakeLists.txt +++ b/src/nbl/ext/CMakeLists.txt @@ -38,6 +38,14 @@ if (NBL_BUILD_OPTIX) ) endif() +add_subdirectory(CUDAInterop) +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_LIB + ${NBL_EXT_CUDA_INTEROP_LIB} + PARENT_SCOPE + ) +endif() + if (NBL_BUILD_IMGUI) add_subdirectory(ImGui) set(NBL_EXT_IMGUI_UI_INCLUDE_DIRS diff --git a/src/nbl/ext/CUDAInterop/CMakeLists.txt b/src/nbl/ext/CUDAInterop/CMakeLists.txt new file mode 100644 index 0000000000..a9e1663fa9 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/CMakeLists.txt @@ -0,0 +1,26 @@ +include(common) +include(NablaCUDAInteropHelpers) + +if (NBL_COMPILE_WITH_CUDA) + set(NBL_EXT_CUDA_INTEROP_LIB "NblExtCUDA_INTEROP") + + file(GLOB NBL_EXT_CUDA_INTEROP_IDE_HEADERS CONFIGURE_DEPENDS "${NBL_ROOT_PATH}/include/nbl/ext/CUDAInterop/*.h") + set(NBL_EXT_CUDA_INTEROP_IDE_SOURCES + ${NBL_EXT_CUDA_INTEROP_IDE_HEADERS} + CMakeLists.txt + README.md + ) + 
set_source_files_properties(${NBL_EXT_CUDA_INTEROP_IDE_SOURCES} PROPERTIES HEADER_FILE_ONLY TRUE) + + # Header-only opt-in target. It builds no artifact and adds CUDA SDK usage requirements only for native interop consumers. + add_library(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE EXCLUDE_FROM_ALL ${NBL_EXT_CUDA_INTEROP_IDE_SOURCES}) + target_link_libraries(${NBL_EXT_CUDA_INTEROP_LIB} INTERFACE + Nabla + CUDA::toolkit + ) + set_target_properties(${NBL_EXT_CUDA_INTEROP_LIB} PROPERTIES EXPORT_NAME "ext::CUDAInterop") + add_library(Nabla::ext::CUDAInterop ALIAS ${NBL_EXT_CUDA_INTEROP_LIB}) + set(NBL_EXT_CUDA_INTEROP_LIB "${NBL_EXT_CUDA_INTEROP_LIB}" PARENT_SCOPE) +endif() + +add_subdirectory(smoke) diff --git a/src/nbl/ext/CUDAInterop/README.md b/src/nbl/ext/CUDAInterop/README.md new file mode 100644 index 0000000000..0d8ebe2f08 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/README.md @@ -0,0 +1,182 @@ +# CUDA Interop + +## Layout + +- `Nabla::Nabla` owns the SDK-free CUDA interop API in `nbl/video/CCUDA*.h` and the implementation in `src/nbl/video/CCUDA*.cpp`. +- The public Nabla headers do not include `cuda.h`, `nvrtc.h`, or other CUDA SDK headers. A consumer that only links `Nabla::Nabla` does not need a CUDA SDK install just to parse Nabla headers. +- CUDA native state is stored behind incomplete `SNativeState` members in Nabla classes. Public headers expose fixed-layout opaque value handles from `nbl/video/CUDAInteropHandles.h`. +- `Nabla::ext::CUDAInterop` is an `INTERFACE` target. It builds no artifact. It only adds the SDK opt-in header, `CUDA::toolkit`, and runtime-header discovery setup to targets that ask for raw CUDA interop. +- `nbl/video/CUDAInteropNativeAPI.h` is the low-level SDK boundary used by Nabla's CUDA implementation and by opt-in consumers. It declares the dynamic CUDA/NVRTC tables and binds SDK-free opaque handles to CUDA/NVRTC SDK types. +- `nbl/ext/CUDAInterop/CUDAInteropNative.h` is the public opt-in entrypoint. 
It includes the native API header so SDK-typed code can use CUDA/NVRTC handles directly with Nabla interop methods. + +## CMake Usage + +`Nabla::Nabla`-only usage stays SDK-free: + +```cmake +find_package(Nabla CONFIG REQUIRED) +target_link_libraries(app PRIVATE Nabla::Nabla) +``` + +SDK-typed CUDA interop is explicit: + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE) +``` + +`nbl_target_link_cuda_interop` links `Nabla::ext::CUDAInterop` and writes `nbl_cuda_interop_runtime.json` next to the target executable during CMake generation. + +Optional overrides: + +```cmake +find_package(Nabla CONFIG REQUIRED COMPONENTS CUDAInterop) +nbl_target_link_cuda_interop(native_app PRIVATE + INCLUDE_DIRS "${cuda_runtime_headers}" +) + +nbl_target_link_cuda_interop(native_app PRIVATE + RUNTIME_JSON "${CMAKE_CURRENT_BINARY_DIR}/$/my_cuda_runtime.json" +) +``` + +Consumers can also choose the SDK used for SDK-typed compilation with: + +```cmake +cmake -S . -B build -DNabla_CUDA_TOOLKIT_ROOT= +``` + +This affects SDK opt-in compilation and generated runtime header discovery only. It does not rebuild Nabla and does not change the `Nabla.dll` ABI. + +## SDK Opt-In Usage + +```cpp +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" + +auto handler = nbl::video::CCUDAHandler::create(system, std::move(logger)); +auto cudaDevice = handler->createDevice(std::move(vulkanConnection), physicalDevice); + +const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); +if (!exactBuildSDK) +{ + // Warn here, or return false if this application requires exact same-SDK policy. 
+} + +auto memory = cudaDevice->createExportableMemory({ + .size = size, + .alignment = alignment, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, +}); + +auto& cu = handler->getCUDAFunctionTable(); +auto& nvrtc = handler->getNVRTCFunctionTable(); +int driverVersion = 0; +NBL_CUDA_INTEROP_ASSERT_SUCCESS(cu.pcuDriverGetVersion(&driverVersion), handler); + +CUdeviceptr mapped = 0; +if (importedMemory) + importedMemory->getMappedBuffer(mapped); + +CUdeviceptr exported = memory->getDeviceptr(); + +nvrtcProgram program = nullptr; +auto createResult = handler->createProgram(program, std::string(cudaSource), "kernel.cu"); + +std::string log; +auto compile = handler->compileDirectlyToPTX( + std::move(cudaSource), + "kernel.cu", + cudaDevice->geDefaultCompileOptions(), + &log +); +``` + +SDK opt-in access is not a full CUDA wrapper. It is the glue between Nabla resource lifetime and raw CUDA interop: + +- `CCUDAHandler::getCUDAFunctionTable` and `CCUDAHandler::getNVRTCFunctionTable` expose the loaded Driver API and NVRTC tables after SDK opt-in. +- The shipped tables contain the CUDA/NVRTC calls used and tested by Nabla. SDK opt-in code can load extra symbols from the same dynamic table without changing Nabla's ABI. The symbol name must be declared by the CUDA SDK headers visible to that translation unit: + +```cpp +auto pcuNewCall = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuNewCall); +if (pcuNewCall) + pcuNewCall(...); +``` + +- `cuda_interop::SCU*`, `SCUresult`, `SNVRTCResult`, and `SNVRTCProgram` are SDK-free opaque values in Nabla headers. After including `CUDAInteropNative.h`, they become constructible from and convertible to matching CUDA/NVRTC SDK types such as `CUdeviceptr`, `CUexternalSemaphore`, `CUresult`, `nvrtcResult`, and `nvrtcProgram`. +- CUDA enum values can be passed to SDK-free Nabla methods such as `CCUDADevice::createExportableMemory` and `CCUDADevice::roundToGranularity`. Nabla stores them as integer values in its public ABI. 
+- SDK-free output parameters use `cuda_interop::SOutput<...>`. SDK-free code can pass opaque `SCU*` values or pointers. SDK opt-in code can pass matching native CUDA/NVRTC output variables directly, for example `CUdeviceptr mapped; importedMemory->getMappedBuffer(mapped);` or `nvrtcProgram program; handler->createProgram(program, ...)`. +- `CCUDAHandler::compileProgram`, `getProgramLog`, `getPTX`, and `compileDirectlyToPTX` are SDK-free Nabla methods. SDK opt-in code can use their results with native `nvrtcProgram` / `nvrtcResult` because the opaque conversions are enabled by `CUDAInteropNative.h`. +- `NBL_CUDA_INTEROP_ASSERT_SUCCESS(expr, handler)` is available for call sites that intentionally assert on CUDA/NVRTC failures. Pass a pointer-like `CCUDAHandler` handle. Nabla implementation code should still prefer explicit error handling and clean returns. +- `cuda_native::isBuildCUDASDKVersionExactMatch()` checks exact SDK version equality between the consumer translation unit and the SDK used to build Nabla's interop implementation. It is a policy helper, not an automatic runtime rejection rule. + +Smoke examples: + +- `src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp` checks that `Nabla::Nabla` headers stay SDK-free. +- `src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp` checks SDK opt-in, runtime header discovery, `cuda_fp16.h`, NVRTC, extra dynamic symbol loading, and raw interop usage. + +## ABI + +- `CCUDAHandler`, `CCUDADevice`, `CCUDAExportableMemory`, `CCUDAImportedMemory`, and `CCUDAImportedSemaphore` are exported from `Nabla.dll` through the normal Nabla ABI. +- Their public declarations do not expose CUDA SDK structs, CUDA SDK layouts, or `cuda.h` / `nvrtc.h` includes. +- Opaque handle types are small trivially-copyable byte arrays with fixed size/alignment chosen to match CUDA SDK handle storage. The SDK opt-in header validates this with `static_assert`s against the SDK used by the consumer. 
+- CUDA implementation state is owned by Nabla through private `SNativeState` members. Consumers cannot construct CUDA wrapper objects with arbitrary internal CUDA state. +- SDK-sized arrays, CUDA enum storage, and CUDA implementation state stay private to Nabla. +- A consumer can build SDK opt-in code with its own compatible SDK independently from the SDK used to build Nabla. SDK-typed code can check `cuda_native::isBuildCUDASDKVersionExactMatch()` when exact CUDA SDK version matching is required. +- Runtime include-option construction is header-only and is not part of the exported ABI. +- The loaded CUDA driver and NVRTC runtime are validated at runtime. + +## Runtime Header Discovery + +NVRTC may need CUDA runtime headers when user kernels include files such as `cuda_fp16.h`, `vector_types.h`, or `cuda_runtime_api.h`. This is a runtime concern of applications that compile CUDA source with NVRTC, not a `Nabla::Nabla` package requirement. + +- `nbl_target_link_cuda_interop` generates `nbl_cuda_interop_runtime.json` for the target that opted into SDK-typed CUDA interop. +- The JSON is a build artifact. Nabla packages do not install host-specific CUDA paths. +- Package consumers generate their own JSON when they call `nbl_target_link_cuda_interop`. +- `NBL_CUDA_INTEROP_RUNTIME_JSON` can point runtime discovery at custom JSON files without rebuilding the application. +- Runtime lookup checks explicit JSON paths first, then executable-local JSON, app-local header bundles, explicit include-dir environment variables, `CUDA_PATH` style toolkit roots, Python/conda package layouts, and common system install roots. +- Runtime lookup records the source of every accepted include root and parses `CUDA_VERSION` from `cuda.h` when available. The startup report prints the primary include root, its source, its parsed CUDA version, and the full search order. +- The first include root is not required to match the SDK used to build Nabla. 
It is the first `-I` path visible to NVRTC, so the first path containing a requested header wins just like normal C/C++ include search. +- If the primary runtime header root is incomplete or reports a different CUDA version than the loaded NVRTC runtime, Nabla logs a warning. This is diagnostic policy, not an automatic hard failure. +- The probe looks for directories that contain CUDA runtime headers. It does not hardcode a CUDA major version in app-local paths. +- `CCUDAHandler` captures discovered include directories when it is created. `CCUDAHandler::compileDirectlyToPTX` reuses those exact include options, so the startup report matches the NVRTC search paths used by that handler. + +Production machines do not need the full CUDA SDK just because Nabla was built with CUDA. Applications that use NVRTC with CUDA runtime headers can provide those headers through generated JSON, a custom JSON path, an app-local bundle, an official runtime/header package, or an installed toolkit. + +Nabla could ship an app-local bundle of selected CUDA runtime headers and make it available to runtime discovery. That model is allowed by the NVIDIA CUDA EULA for the components listed in Attachment A. Nabla intentionally does not bundle these headers. Because of that, end users should prefer an official CUDA runtime/header package for production machines. An installed toolkit also works, but the full toolkit is mainly for developers compiling Nabla or SDK-typed CUDA code. + +NVIDIA CUDA EULA allows redistribution only for selected components. The distribution section says: "The portions of the SDK that are distributable under the Agreement are listed in Attachment A." Attachment A says: "The following CUDA Toolkit files may be distributed with applications developed by you." 
See: + +- https://docs.nvidia.com/cuda/eula/#distribution +- https://docs.nvidia.com/cuda/eula/#attachment-a + +This means the Attachment A header groups below can be redistributed with applications under the EULA terms. It does not mean the full CUDA SDK can be redistributed. Applications that need NVRTC runtime compilation can decide whether to ship the allowed headers, depend on an official runtime/header package, or point discovery at an installed toolkit/header package. + +Attachment A lists header groups relevant to NVRTC runtime compilation: + +- NVIDIA Runtime Compilation Library and Header: `nvrtc.h` +- CUDA Floating Point Type Headers: `cuda_fp16.h`, `cuda_fp16.hpp`, `cuda_bf16.h`, `cuda_bf16.hpp`, `cuda_fp8.h`, `cuda_fp8.hpp`, `cuda_fp6.h`, `cuda_fp6.hpp`, `cuda_fp4.h`, `cuda_fp4.hpp` +- CUDA Headers for Runtime Compilation: `crt/host_defines.h`, `cuComplex.h`, `cuda_awbarrier_helpers.h`, `cuda_awbarrier_primitives.h`, `cuda_awbarrier.h`, `cuda_pipeline_helpers.h`, `cuda_pipeline_primitives.h`, `cuda_pipeline.h`, `cuda_runtime_api.h`, `cuda.h`, `cuda/std/tuple`, `cuda/std/type_traits`, `cuda/std/utility`, `device_types.h`, `vector_functions.h`, and `vector_types.h` + +CuPy documents the same NVRTC issue for CUDA 12.2+. Their install docs say: "On CUDA 12.2 or later, CUDA Runtime header files are required to compile kernels in CuPy." They show the common `vector_types.h` failure and recommend CUDA runtime header packages for PyPI/system package installs: + +- https://docs.cupy.dev/en/v13.5.0/install.html#cupy-always-raises-nvrtc-error-compilation-6 +- https://github.com/cupy/cupy/issues/8466 + +## CUDA ON/OFF Builds + +- SDK-free public headers stay stable for CUDA ON and CUDA OFF Nabla builds. +- Nabla implementation `.cpp` files include CUDA SDK headers only behind `_NBL_COMPILE_WITH_CUDA_`. +- CUDA OFF implementations are local stubs in the same `.cpp` files. 
Factory/import/export paths return `nullptr` for unavailable CUDA features instead of producing unresolved symbols. +- The Nabla source list stays stable, so CUDA interop `.cpp` files remain visible in IDE projects for both CUDA ON and CUDA OFF builds. + +## Related Designs + +The split follows the same boundary pattern used by mature GPU projects: public/common headers avoid vendor SDK requirements, vendor SDK access is explicit, and implementation details stay outside the public API. + +- OpenCV keeps common CUDA-facing headers independent from CUDA Runtime API and exposes raw `cudaStream_t` / `cudaEvent_t` through a separate accessor header: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/cuda_stream_accessor.hpp#L50-L79 +- OpenCV keeps CUDA implementation headers private and includes `cuda.h`, `cuda_runtime.h`, and NPP there: https://github.com/opencv/opencv/blob/808d2d596c475d95fedb6025c9ed425d62bba04c/modules/core/include/opencv2/core/private.cuda.hpp#L47-L61 +- Blender/Cycles exposes a CUDA device boundary without CUDA SDK headers in the boundary header: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device.h#L7-L27 +- Blender/Cycles keeps `CUdevice`, `CUcontext`, `cuda.h`, and `cuew.h` in the CUDA implementation header/source: https://github.com/blender/blender/blob/794c527e8595a9f448e0143a217d0ceb648c5e7e/intern/cycles/device/cuda/device_impl.h#L12-L30 +- OpenMM keeps the CUDA platform boundary on OpenMM types/properties in `CudaPlatform.h`, while `CudaContext.h` is the CUDA-specific low-level header that includes CUDA SDK headers and exposes `CUmodule` / `CUfunction`: https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaPlatform.h#L48-L120 and https://github.com/openmm/openmm/blob/master/platforms/cuda/include/CudaContext.h#L32-L52 +- GROMACS gates CUDA source handling behind `GMX_GPU_CUDA` in the library build 
and keeps CUDA runtime types in internal GPU utility headers: https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/CMakeLists.txt#L339-L367 and https://gitlab.com/gromacs/gromacs/-/blob/main/src/gromacs/gpu_utils/gputraits.cuh#L44-L58 +- ONNX Runtime keeps the public C API provider-neutral and routes CUDA through provider-specific bridge/factory code: https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_c_api.h#L1-L80 and https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/session/provider_bridge_ort.cc#L110-L150 diff --git a/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt new file mode 100644 index 0000000000..e16d3feac0 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/CMakeLists.txt @@ -0,0 +1,35 @@ +cmake_minimum_required(VERSION 3.30) +project(NblExtCUDAInteropSmoke CXX) + +option(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE "Build the CUDA native opt-in smoke from an installed Nabla package." 
OFF) +set(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON "" CACHE FILEPATH "Optional CUDA interop runtime JSON path used by the native smoke.") + +if(NOT TARGET Nabla::Nabla) + set(_NBL_CUDA_INTEROP_SMOKE_COMPONENTS Core) + if(NBL_CUDA_INTEROP_SMOKE_WITH_NATIVE) + list(APPEND _NBL_CUDA_INTEROP_SMOKE_COMPONENTS CUDAInterop) + endif() + find_package(Nabla REQUIRED CONFIG COMPONENTS ${_NBL_CUDA_INTEROP_SMOKE_COMPONENTS}) +endif() + +enable_testing() + +function(nbl_add_cuda_interop_smoke TARGET_NAME SOURCE_FILE) + add_executable(${TARGET_NAME} ${SOURCE_FILE}) + target_compile_features(${TARGET_NAME} PRIVATE cxx_std_20) + + add_test(NAME ${TARGET_NAME} COMMAND $) +endfunction() + +nbl_add_cuda_interop_smoke(NblExtCUDAInteropPublicBoundarySmoke public_boundary.cpp) +target_link_libraries(NblExtCUDAInteropPublicBoundarySmoke PRIVATE Nabla::Nabla) + +if(TARGET Nabla::ext::CUDAInterop) + nbl_add_cuda_interop_smoke(NblExtCUDAInteropNativeOptInSmoke native_opt_in.cpp) + set(_nbl_cuda_interop_smoke_args PRIVATE) + if(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON) + list(APPEND _nbl_cuda_interop_smoke_args RUNTIME_JSON "${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + target_compile_definitions(NblExtCUDAInteropNativeOptInSmoke PRIVATE NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON="${NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}") + endif() + nbl_target_link_cuda_interop(NblExtCUDAInteropNativeOptInSmoke ${_nbl_cuda_interop_smoke_args}) +endif() diff --git a/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp new file mode 100644 index 0000000000..d1c15822cd --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/native_opt_in.cpp @@ -0,0 +1,192 @@ +#include "nbl/ext/CUDAInterop/CUDAInteropNative.h" +#include "nbl/system/IApplicationFramework.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef CUDA_VERSION +#error "Nabla::ext::CUDAInterop must expose CUDA SDK headers." 
+#endif + +namespace +{ +using namespace nbl; +using namespace nbl::video; + +[[maybe_unused]] bool compileVulkanCudaInteropRecipe( + CCUDADevice& cudaDevice, + ILogicalDevice* vulkanDevice, + core::smart_refctd_ptr vulkanMemory, + core::smart_refctd_ptr vulkanSemaphore) +{ + auto cudaMemory = cudaDevice.createExportableMemory({ + .size = 4096, + .alignment = 4096, + .locationType = CU_MEM_LOCATION_TYPE_DEVICE, + }); + if (!cudaMemory) + return false; + + auto exportedToVulkan = cudaMemory->exportAsMemory(vulkanDevice); + auto importedFromVulkan = cudaDevice.importExternalMemory(std::move(vulkanMemory)); + auto importedSemaphore = cudaDevice.importExternalSemaphore(std::move(vulkanSemaphore)); + + CUdeviceptr mappedVulkanMemory = 0; + if (importedFromVulkan) + importedFromVulkan->getMappedBuffer(mappedVulkanMemory); + + const CUdeviceptr cudaDevicePtr = cudaMemory->getDeviceptr(); + CUexternalSemaphore cudaSemaphore = nullptr; + if (importedSemaphore) + cudaSemaphore = importedSemaphore->getInternalObject(); + return exportedToVulkan.get() && mappedVulkanMemory && cudaDevicePtr && cudaSemaphore; +} + +bool cudaDriverRoundtrip(CCUDAHandler& handler, CUdevice device) +{ + auto& cuda = handler.getCUDAFunctionTable(); + + CUcontext context = nullptr; + if (cuda.pcuDevicePrimaryCtxRetain(&context, device)!=CUDA_SUCCESS) + return false; + + CUcontext poppedContext = nullptr; + bool contextPushed = false; + auto releaseContext = [&]() + { + if (context) + { + if (contextPushed) + cuda.pcuCtxPopCurrent_v2(&poppedContext); + cuda.pcuDevicePrimaryCtxRelease_v2(device); + } + }; + + if (cuda.pcuCtxPushCurrent_v2(context)!=CUDA_SUCCESS) + { + releaseContext(); + return false; + } + contextPushed = true; + + constexpr std::array input = {0x12345678u, 0x90abcdefu, 0xfedcba09u, 0x87654321u}; + std::array output = {}; + + CUdeviceptr deviceMemory = 0; + bool ok = cuda.pcuMemAlloc_v2(&deviceMemory, sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = 
cuda.pcuMemcpyHtoD_v2(deviceMemory,input.data(),sizeof(input))==CUDA_SUCCESS; + if (ok) + ok = cuda.pcuMemcpyDtoH_v2(output.data(),deviceMemory,sizeof(output))==CUDA_SUCCESS; + if (deviceMemory) + ok = cuda.pcuMemFree_v2(deviceMemory)==CUDA_SUCCESS && ok; + + releaseContext(); + return ok && std::ranges::equal(input, output); +} + +bool cudaFp16HeaderCompileProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + #include + extern "C" __global__ void fp16_probe(unsigned short* out) + { + out[0] = sizeof(__half); + } + )cuda"; + + std::string log; + auto compile = handler.compileDirectlyToPTX( + std::string(Source), + "cuda_fp16_discovery_probe.cu", + {nullptr,nullptr}, + &log, + 0, + nullptr, + nullptr + ); + return compile.result==NVRTC_SUCCESS && compile.ptx && compile.ptx->getSize()>0u; +} + +bool nativeNVRTCOutputProbe(CCUDAHandler& handler) +{ + constexpr const char* Source = R"cuda( + extern "C" __global__ void native_output_probe() {} + )cuda"; + + nvrtcProgram program = nullptr; + const auto result = handler.createProgram(program,std::string(Source),"native_output_probe.cu"); + if (program) + handler.getNVRTCFunctionTable().pnvrtcDestroyProgram(&program); + return result==NVRTC_SUCCESS; +} +} + +class CUDAInteropNativeOptInSmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + if (!isAPILoaded()) + return false; + + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); + static_assert(nbl::video::cuda_interop::cuda_opaque_handle); + [[maybe_unused]] const bool exactBuildSDK = nbl::video::cuda_native::isBuildCUDASDKVersionExactMatch(); + + #ifdef NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON + const nbl::core::vector explicitIncludeDirs; + const nbl::core::vector runtimePathFiles = {NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON}; + const auto runtimeEnvironment = 
nbl::video::cuda_interop::findRuntimeCompileEnvironment(explicitIncludeDirs, runtimePathFiles); + if (!std::filesystem::exists(NBL_CUDA_INTEROP_SMOKE_RUNTIME_JSON)) + return false; + #else + const auto runtimeEnvironment = nbl::video::cuda_interop::findRuntimeCompileEnvironment(); + #endif + const auto includeOptions = nbl::video::cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment); + const auto hasRuntimeHeaders = std::find_if(runtimeEnvironment.includeDirs.begin(),runtimeEnvironment.includeDirs.end(),[](const auto& includeDir) { + return std::filesystem::exists(includeDir/"cuda_fp16.h") || std::filesystem::exists(includeDir/"cuda_runtime_api.h"); + })!=runtimeEnvironment.includeDirs.end(); + if (includeOptions.empty() || !hasRuntimeHeaders) + return false; + + auto handler = nbl::video::CCUDAHandler::create(nullptr, nullptr); + if (!handler) + return true; + + auto pcuDriverGetVersion = NBL_SYSTEM_LOAD_DYNLIB_FUNCPTR(handler->getCUDAFunctionTable(), cuDriverGetVersion); + int loadedDriverVersion = 0; + if (!pcuDriverGetVersion || pcuDriverGetVersion(&loadedDriverVersion)!=CUDA_SUCCESS || loadedDriverVersion==0) + return false; + + if (!nativeNVRTCOutputProbe(*handler)) + return false; + + if (!cudaFp16HeaderCompileProbe(*handler)) + return false; + + int deviceCount = 0; + if (handler->getCUDAFunctionTable().pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount==0) + return true; + + CUdevice device = {}; + if (handler->getCUDAFunctionTable().pcuDeviceGet(&device,0)!=CUDA_SUCCESS) + return false; + + return cudaDriverRoundtrip(*handler, device); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +NBL_MAIN_FUNC(CUDAInteropNativeOptInSmoke) diff --git a/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp new file mode 100644 index 0000000000..73307599b1 --- /dev/null +++ b/src/nbl/ext/CUDAInterop/smoke/public_boundary.cpp @@ -0,0 +1,40 @@ +#include "nabla.h" + 
+#include "nbl/system/IApplicationFramework.h" +#include "nbl/video/CUDAInterop.h" +#include + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#error "Nabla consumers must not get the CUDA opt-in define." +#endif + +#ifdef CUDA_VERSION +#error "Nabla consumers must not include CUDA SDK headers." +#endif + +namespace +{ + +class CUDAInteropPublicBoundarySmoke final : public nbl::system::IApplicationFramework +{ + using base_t = nbl::system::IApplicationFramework; + +public: + using base_t::base_t; + + bool onAppInitialized(nbl::core::smart_refctd_ptr&&) override + { + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + static_assert(std::is_class_v); + return isAPILoaded(); + } + + void workLoopBody() override {} + bool keepRunning() override { return false; } +}; + +} + +NBL_MAIN_FUNC(CUDAInteropPublicBoundarySmoke) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..edfd36844c 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -1,130 +1,368 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDADevice.h" +#include "nbl/video/CUDAInterop.h" -#ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::E_VIRTUAL_ARCHITECTURE CCUDADevice::getVirtualArchitecture() const { - m_defaultCompileOptions.push_back("--std=c++14"); - m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); - m_defaultCompileOptions.push_back("-dc"); - m_defaultCompileOptions.push_back("-use_fast_math"); + return m_virtualArchitecture; +} + +core::SRange CCUDADevice::geDefaultCompileOptions() const +{ + return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } +const CCUDAHandler* CCUDADevice::getHandler() const +{ + return m_handler.get(); +} + +} + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + +#ifdef _WIN32 +#include +#endif + +namespace nbl::video +{ + +namespace +{ + +constexpr const char* VirtualArchCompileOption[] = { + "-arch=compute_30", + "-arch=compute_32", + "-arch=compute_35", + "-arch=compute_37", + "-arch=compute_50", + "-arch=compute_52", + "-arch=compute_53", + "-arch=compute_60", + "-arch=compute_61", + "-arch=compute_62", + "-arch=compute_70", + "-arch=compute_72", + "-arch=compute_75", + "-arch=compute_80" +}; -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) +static_assert(sizeof(VirtualArchCompileOption)/sizeof(*VirtualArchCompileOption)==CCUDADevice::EVA_COUNT); + +static CUmemAllocationHandleType getAllocationHandleType() { - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = 
cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if (retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; +#ifdef _WIN32 + return CU_MEM_HANDLE_TYPE_WIN32; +#else + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif +} + } -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) : + m_logger(vulkanDevice->getDebugCallback()->getLogger()), + m_defaultCompileOptions(), + m_vulkanConnection(std::move(vulkanConnection)), + m_virtualArchitecture(virtualArchitecture), + m_handler(std::move(handler)), + m_native(std::move(nativeState)) { - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; + assert(m_native); + + m_defaultCompileOptions.push_back("--std=c++14"); + m_defaultCompileOptions.push_back(VirtualArchCompileOption[m_virtualArchitecture]); + m_defaultCompileOptions.push_back("-dc"); + m_defaultCompileOptions.push_back("-use_fast_math"); + m_defaultCompileOptions.push_back("-IC:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v13.2/include/"); - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) + const auto& cu = m_handler->getCUDAFunctionTable(); + + if (!m_handler->defaultHandleResult(cu.pcuCtxCreate_v4(&m_native->context, nullptr, 0, m_native->handle))) + return; + if (!m_handler->defaultHandleResult(cu.pcuCtxSetCurrent(m_native->context))) + return; + + for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - case GL_TEXTURE_2D: - case 
GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; +#ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; +#endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = static_cast(locationType), .id = m_native->handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; + if (!m_handler->defaultHandleResult(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM))) + return; } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if (retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; + m_valid = true; } +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const +{ + return m_native->handle; +} -constexpr auto MaxAquireOps = 4096u; +cuda_interop::SCUcontext CCUDADevice::getContext() const +{ + return m_native->context; +} -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) +static bool isDeviceLocal(CUmemLocationType location) { - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); + return err; + } + + CUmemAccessDesc accessDesc = { + .location = { .type = location, .id = nativeDevice }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? 
sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; + if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + handler.defaultHandleResult(cu.pcuMemUnmap(ptr, size)); + handler.defaultHandleResult(cu.pcuMemAddressFree(ptr, size)); + return err; } + + *outPtr = ptr; + return CUDA_SUCCESS; } -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&& inParams) { - if (linksBegin+MaxAquireOps(inParams.locationType); + + CCUDAExportableMemory::SCachedCreationParams params = { + .granularSize = roundToGranularity(inParams.locationType, inParams.size), + .deviceLocal = isDeviceLocal(location) + }; + if (params.granularSize==0u) + return nullptr; + + auto& cu = handler->getCUDAFunctionTable(); + +#ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; +#endif - CUresult result = acquireResourcesFromGraphics(stackScratch,linksBegin,linksEnd,stream); - if (result != CUDA_SUCCESS) - return result; + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = getAllocationHandleType(), + .location = { .type = location, .id = m_native->handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; + + auto nativeState = std::make_unique(); + + CUmemGenericAllocationHandle mem; + if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) + { + m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + return nullptr; + } + + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) + { + m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + handler->defaultHandleResult(cu.pcuMemRelease(mem)); + return nullptr; + } - for (auto 
iit=linksBegin; iit!=linksEnd; iit++) + if (const auto err = reserveAddressAndMapMemory(*handler,m_native->handle,&nativeState->ptr, params.granularSize, inParams.alignment, location, mem); CUDA_SUCCESS != err) { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; + m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; + handler->defaultHandleResult(cu.pcuMemRelease(mem)); + + if (!system::CloseExternalHandle(params.externalHandle)) + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + + return nullptr; } - return CUDA_SUCCESS; + + if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + { + handler->defaultHandleResult(err); + handler->defaultHandleResult(cu.pcuMemUnmap(nativeState->ptr, params.granularSize)); + handler->defaultHandleResult(cu.pcuMemAddressFree(nativeState->ptr, params.granularSize)); + if (!system::CloseExternalHandle(params.externalHandle)) + m_logger.log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + return nullptr; + } + + return CCUDAExportableMemory::create(core::smart_refctd_ptr(this),std::move(params),std::move(nativeState)); } -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - if (linksBegin+MaxAquireOpsgetCUDAFunctionTable(); + const auto handleType = mem->getCreationParams().externalHandleType; - CUresult result = acquireResourcesFromGraphics(stackScratch,linksBegin,linksEnd,stream); - if (result != CUDA_SUCCESS) - return result; + if (!handleType) return nullptr; - auto ait = arrayIndices; - auto mit = mipLevels; - for (auto iit=linksBegin; 
iit!=linksEnd; iit++,ait++,mit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; + const auto externalHandle = mem->getExportHandle(); + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; +#ifdef _WIN32 + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; + extMemDesc.handle.win32.handle = externalHandle; +#else + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; + extMemDesc.handle.fd = externalHandle; +#endif + extMemDesc.size = mem->getAllocationSize(); - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; + CUexternalMemory cuExtMem; + if (const auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); + return nullptr; } - return CUDA_SUCCESS; + return core::smart_refctd_ptr( + new CCUDAImportedMemory(core::smart_refctd_ptr(this),std::move(mem),std::make_unique(cuExtMem)), + core::dont_grab + ); } + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) +{ + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleTypes.value; + + if (!handleType) + return nullptr; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { +#ifdef _WIN32 + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, + .handle = {.win32 = {.handle = sema->getExportHandle() }}, +#else + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + .handle = {.fd = sema->getExportHandle()} #endif + }; + + + CUexternalSemaphore cusema; + if (const auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import semaphore into CUDA!"); + return nullptr; + } + + return core::smart_refctd_ptr( + new 
CCUDAImportedSemaphore(core::smart_refctd_ptr(this),std::move(sema),std::make_unique(cusema)), + core::dont_grab + ); +} + +CCUDADevice::~CCUDADevice() +{ + if (m_native->context) + m_handler->defaultHandleResult(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_native->context)); +} + +bool CCUDADevice::isValid() const +{ + return m_valid; +} + +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDADevice::SNativeState {}; + +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + std::unique_ptr&& nativeState, + core::smart_refctd_ptr&& handler) + : m_logger(nullptr) + , m_vulkanConnection(std::move(vulkanConnection)) + , m_virtualArchitecture(virtualArchitecture) + , m_valid(false) + , m_handler(std::move(handler)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDADevice::~CCUDADevice() = default; + +bool CCUDADevice::isValid() const +{ + return false; +} + +cuda_interop::SCUdevice CCUDADevice::getInternalObject() const +{ + return {}; +} + +cuda_interop::SCUcontext CCUDADevice::getContext() const +{ + return {}; +} + +core::smart_refctd_ptr CCUDADevice::createExportableMemory(SExportableMemoryCreationParams&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&&) +{ + return nullptr; +} } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp new file mode 100644 index 0000000000..58152c2fcd --- /dev/null +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -0,0 +1,113 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + +namespace nbl::video +{ + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + if (m_params.deviceLocal) + memoryTypeBits &= vram; + else + memoryTypeBits &= ~vram; + + IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + { + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + m_params.externalHandle + }).memory; +} + +CCUDAExportableMemory::~CCUDAExportableMemory() +{ + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + m_device->getHandler()->defaultHandleResult(cu.pcuMemUnmap(m_native->ptr, m_params.granularSize)); + + m_device->getHandler()->defaultHandleResult(cu.pcuMemAddressFree(m_native->ptr, m_params.granularSize)); + + if 
(!system::CloseExternalHandle(m_params.externalHandle)) + m_device->getHandler()->getLogger().log("Fail to close exported CUDA memory handle!", system::ILogger::ELL_ERROR); + +} + +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const +{ + return m_native->ptr; +} + +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. +struct CCUDAExportableMemory::SNativeState {}; + +CCUDAExportableMemory::CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +core::smart_refctd_ptr CCUDAExportableMemory::create(core::smart_refctd_ptr device, SCachedCreationParams&& params, std::unique_ptr&& nativeState) +{ + return core::smart_refctd_ptr( + new CCUDAExportableMemory(std::move(device),std::move(params),std::move(nativeState)), + core::dont_grab + ); +} + +CCUDAExportableMemory::~CCUDAExportableMemory() = default; + +cuda_interop::SCUdeviceptr CCUDAExportableMemory::getDeviceptr() const +{ + return {}; +} + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice*, IDeviceMemoryBacked*) const +{ + return nullptr; +} + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 7fb60d79bf..c07af698b1 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -2,17 +2,527 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDAHandler.h" +#include "nbl/video/CUDAInterop.h" + +#include "nlohmann/json.hpp" + +#include +#include +#include +#include +#include + +namespace nbl::video::cuda_interop +{ +namespace +{ + +#if defined(_NBL_PLATFORM_WINDOWS_) +inline constexpr char EnvironmentPathListSeparator = ';'; +#else +inline constexpr char EnvironmentPathListSeparator = ':'; +#endif + +std::string readEnvironmentVariable(const char* name) +{ + if (const char* value = std::getenv(name)) + return value; + return {}; +} + +bool isDirectory(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_directory(path,error); +} + +bool isRegularFile(const system::path& path) +{ + std::error_code error; + return std::filesystem::exists(path,error) && std::filesystem::is_regular_file(path,error); +} + +system::path normalizedAbsolute(system::path path) +{ + std::error_code error; + auto absolute = std::filesystem::absolute(path,error); + if (error) + absolute = std::move(path); + return absolute.lexically_normal(); +} + +bool looksLikeCUDAIncludeDir(const system::path& path) +{ + if (!isDirectory(path)) + return false; + + return isRegularFile(path/"cuda_fp16.h") || + isRegularFile(path/"cuda_runtime_api.h") || + isRegularFile(path/"vector_types.h") || + isRegularFile(path/"cuda.h") || + isRegularFile(path/"nv"/"target"); +} + +uint32_t readCUDAVersion(const system::path& includeDir) +{ + std::ifstream input(includeDir/"cuda.h"); + if (!input) + return 0u; + + std::string line; + while (std::getline(input,line)) + { + std::istringstream stream(line); + std::string directive; + stream >> directive; + if (directive!="#define") + continue; + + std::string name; + stream >> name; + if (name!="CUDA_VERSION") + continue; + + uint32_t version = 0u; + if (stream >> version) + return version; + } + return 0u; +} + +bool 
looksLikeCompleteRuntimeHeaderSet(const system::path& includeDir) +{ + return isRegularFile(includeDir/"cuda.h") && + isRegularFile(includeDir/"cuda_runtime_api.h") && + isRegularFile(includeDir/"vector_types.h"); +} + +void appendIncludeDir(SRuntimeCompileEnvironment& environment, system::path path, std::string source) +{ + if (path.empty() || !looksLikeCUDAIncludeDir(path)) + return; + + path = normalizedAbsolute(std::move(path)); + const auto pathString = path.generic_string(); + const auto alreadyAdded = std::find_if(environment.includeDirs.begin(),environment.includeDirs.end(),[&](const system::path& existing) { + return existing.generic_string()==pathString; + }); + if (alreadyAdded==environment.includeDirs.end()) + { + SRuntimeIncludeDir info; + info.path = path; + info.source = std::move(source); + info.cudaVersion = readCUDAVersion(path); + info.completeRuntimeHeaderSet = looksLikeCompleteRuntimeHeaderSet(path); + + environment.includeDirs.push_back(std::move(path)); + environment.includeDirInfos.push_back(std::move(info)); + } +} + +void appendCUDAIncludeDirsBelow(SRuntimeCompileEnvironment& environment, const system::path& root, uint32_t maxDepth, std::string source) +{ + if (!isDirectory(root)) + return; + + if (looksLikeCUDAIncludeDir(root)) + { + appendIncludeDir(environment,root,std::move(source)); + return; + } + if (maxDepth==0u) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + + std::error_code entryError; + if (!entry.is_directory(entryError)) + continue; + candidates.push_back(entry.path()); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendCUDAIncludeDirsBelow(environment,candidate,maxDepth-1u,source); +} + +void appendCUDAIncludeRoot(SRuntimeCompileEnvironment& 
environment, const system::path& root, std::string source) +{ + if (root.empty()) + return; + + appendIncludeDir(environment,root,source); + appendIncludeDir(environment,root/"include",std::move(source)); +} + +void appendRuntimePathsConfig(SRuntimeCompileEnvironment& environment, const system::path& configFile, const char* source) +{ + if (!isRegularFile(configFile)) + return; + + std::ifstream input(configFile); + if (!input) + return; + + const auto json = nlohmann::json::parse(input,nullptr,false); + if (json.is_discarded()) + return; + + const auto paths = json.find("cudaRuntimeIncludeDirs"); + if (paths==json.end() || !paths->is_array()) + return; + + for (const auto& path : *paths) + if (path.is_string()) + appendIncludeDir(environment,system::path(path.get()),std::string(source)+": "+configFile.generic_string()); +} + +template +void appendPathListEnv(const char* name, Append append) +{ + const auto value = readEnvironmentVariable(name); + if (value.empty()) + return; + + size_t begin = 0; + while (begin& explicitRuntimePathFiles) +{ + for (const auto& runtimePathFile : explicitRuntimePathFiles) + appendRuntimePathsConfig(environment,runtimePathFile,"explicit runtime JSON"); + + appendPathListEnv("NBL_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"NBL_CUDA_INTEROP_RUNTIME_JSON"); + }); + appendPathListEnv("Nabla_CUDA_INTEROP_RUNTIME_JSON",[&](const system::path& path) { + appendRuntimePathsConfig(environment,path,"Nabla_CUDA_INTEROP_RUNTIME_JSON"); + }); + + const auto exeDir = system::executableDirectory(); + if (!exeDir.empty()) + appendRuntimePathsConfig(environment,exeDir/RuntimePathsFileName,"executable-local runtime JSON"); +} + +void appendAppLocalIncludeDirs(SRuntimeCompileEnvironment& environment) +{ + const auto exeDir = system::executableDirectory(); + if (exeDir.empty()) + return; + + appendIncludeDir(environment,exeDir/"cuda"/"include","app-local cuda/include"); + 
appendCUDAIncludeDirsBelow(environment,exeDir/"nvidia",4u,"app-local nvidia package"); + appendIncludeDir(environment,exeDir/"Libraries"/"cuda"/"include","app-local Libraries/cuda/include"); + appendIncludeDir(environment,exeDir.parent_path()/"cuda"/"include","parent app-local cuda/include"); + appendCUDAIncludeDirsBelow(environment,exeDir.parent_path()/"nvidia",4u,"parent app-local nvidia package"); +} + +void appendPythonPackageIncludeDirs(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) +{ + if (root.empty()) + return; + + appendCUDAIncludeDirsBelow(environment,root/"Lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendCUDAIncludeDirsBelow(environment,root/"lib"/"site-packages"/"nvidia",4u,std::string(source)+" Python nvidia package"); + appendIncludeDir(environment,root/"Library"/"include",std::string(source)+" Library/include"); + appendIncludeDir(environment,root/"include",std::string(source)+" include"); +} + +void appendEnvironmentIncludeDirs(SRuntimeCompileEnvironment& environment) +{ + appendPathListEnv("NBL_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"NBL_CUDA_RUNTIME_INCLUDE_DIRS"); + }); + appendPathListEnv("Nabla_CUDA_RUNTIME_INCLUDE_DIRS",[&](const system::path& path) { + appendIncludeDir(environment,path,"Nabla_CUDA_RUNTIME_INCLUDE_DIRS"); + }); + + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_PATH"),"CUDA_PATH"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_HOME"),"CUDA_HOME"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDA_ROOT"),"CUDA_ROOT"); + appendCUDAIncludeRoot(environment,readEnvironmentVariable("CUDAToolkit_ROOT"),"CUDAToolkit_ROOT"); + + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("VIRTUAL_ENV"),"VIRTUAL_ENV"); + appendPythonPackageIncludeDirs(environment,readEnvironmentVariable("CONDA_PREFIX"),"CONDA_PREFIX"); +} + +void 
appendCUDAInstallRoots(SRuntimeCompileEnvironment& environment, const system::path& root, const char* source) +{ + if (!isDirectory(root)) + return; + + core::vector candidates; + std::error_code error; + for (const auto& entry : std::filesystem::directory_iterator(root,error)) + { + if (error) + break; + if (!entry.is_directory(error)) + continue; + candidates.push_back(entry.path()/"include"); + } + + std::sort(candidates.begin(),candidates.end(),[](const system::path& lhs, const system::path& rhs) { + return lhs.generic_string()>rhs.generic_string(); + }); + for (const auto& candidate : candidates) + appendIncludeDir(environment,candidate,source); +} + +void appendSystemIncludeDirs(SRuntimeCompileEnvironment& environment) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + appendCUDAInstallRoots(environment,"C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA","system CUDA Toolkit install root"); + #else + appendIncludeDir(environment,"/usr/local/cuda/include","system /usr/local/cuda"); + appendCUDAInstallRoots(environment,"/usr/local","system /usr/local CUDA install root"); + appendIncludeDir(environment,"/usr/include","system /usr/include"); + #endif +} + +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs, const core::vector& runtimePathFiles) +{ + SRuntimeCompileEnvironment environment; + + /* + Runtime header discovery builds the ordered include list passed to NVRTC. It is not a lock to the CUDA SDK + used to build Nabla. A packaged Nabla must stay relocatable, so host-specific include paths are accepted + only when the application provides them at runtime: direct arguments, JSON next to the executable, an + override JSON, app-local header bundles, environment variables, or finally common toolkit install roots. + + The first root containing a requested header wins exactly like normal C/C++ include search. 
Keep every + accepted root with its source and parsed CUDA_VERSION so startup logs can explain what NVRTC will see. + This is also why mismatched or partial roots produce diagnostics instead of changing discovery order or + hard-failing before the user kernel is compiled. + */ + for (const auto& includeDir : explicitIncludeDirs) + appendIncludeDir(environment,includeDir,"explicit include dir"); + + appendRuntimePathsConfigs(environment,runtimePathFiles); + appendAppLocalIncludeDirs(environment); + appendEnvironmentIncludeDirs(environment); + appendSystemIncludeDirs(environment); + + return environment; +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment(const core::vector& explicitIncludeDirs) +{ + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(explicitIncludeDirs,EmptyRuntimePathFiles); +} + +SRuntimeCompileEnvironment findRuntimeCompileEnvironment() +{ + static const core::vector EmptyIncludeDirs; + static const core::vector EmptyRuntimePathFiles; + return findRuntimeCompileEnvironment(EmptyIncludeDirs,EmptyRuntimePathFiles); +} + +} #ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" +#include "nbl/system/CFileView.h" #include "jitify/jitify.hpp" namespace nbl::video { + +namespace +{ + +int cudaVersionMajor(int version) +{ + return version/1000; +} + +int cudaVersionMinor(int version) +{ + return (version%1000)/10; +} + +int cudaVersionCode(int major, int minor) +{ + return major*1000+minor*10; +} + +system::path loadedRuntimeModulePath(const char* moduleName) +{ + #if defined(_NBL_PLATFORM_WINDOWS_) + const auto moduleDir = system::loadedModuleDirectory(moduleName); + if (moduleDir.empty()) + return {}; + return moduleDir/(std::string(moduleName)+".dll"); + #else + return {}; + #endif +} + +std::string cudaVersionString(int version) +{ + std::ostringstream stream; + stream << cudaVersionMajor(version) << "." 
<< cudaVersionMinor(version); + return stream.str(); +} + +std::string cudaVersionString(const cuda_interop::SRuntimeVersion& version) +{ + std::ostringstream stream; + stream << version[0] << "." << version[1]; + return stream.str(); +} + +std::string runtimeIncludeDirDescription(const cuda_interop::SRuntimeIncludeDir& includeDir) +{ + std::ostringstream stream; + stream << includeDir.path.generic_string() << " (" << includeDir.source; + if (includeDir.cudaVersion!=0u) + stream << ", CUDA_VERSION " << includeDir.cudaVersion << " / " << cudaVersionString(includeDir.cudaVersion); + else + stream << ", CUDA_VERSION unknown"; + if (!includeDir.completeRuntimeHeaderSet) + stream << ", partial header root"; + stream << ")"; + return stream.str(); +} + +std::string cudaRuntimeReport( + const int buildVersion, const int cudaDriverVersion, const system::path& cudaDriverPath, + const cuda_interop::SRuntimeVersion& nvrtcVersion, const std::string& nvrtcLibraryName, const system::path& nvrtcPath, + const cuda_interop::SRuntimeCompileEnvironment& runtimeEnvironment) +{ + std::ostringstream stream; + stream << "CCUDAHandler: CUDA interop runtime report:\n"; + stream << " - Nabla build CUDA SDK: " << cudaVersionString(buildVersion) << "\n"; + stream << " - CUDA Driver API: " << cudaVersionString(cudaDriverVersion); + if (!cudaDriverPath.empty()) + stream << " (" << cudaDriverPath.generic_string() << ")"; + stream << "\n"; + stream << " - NVRTC runtime: " << cudaVersionString(nvrtcVersion) << " (" << nvrtcLibraryName; + if (!nvrtcPath.empty()) + stream << ", " << nvrtcPath.generic_string(); + stream << ")\n"; + + if (runtimeEnvironment.includeDirs.empty()) + { + stream << " - NVRTC runtime header search path: none discovered"; + } + else + { + stream << " - Primary NVRTC runtime header path: " << runtimeIncludeDirDescription(runtimeEnvironment.includeDirInfos.front()) << "\n"; + stream << " - NVRTC runtime header search order (first path containing the requested header wins):\n"; 
+ for (const auto& includeDir : runtimeEnvironment.includeDirInfos) + stream << " - " << runtimeIncludeDirDescription(includeDir) << "\n"; + } + return stream.str(); +} + +} -bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) +{ + assert(m_native); + + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + for (const auto& option : m_native->runtimeIncludeOptions) + m_native->runtimeIncludeOptionPtrs.push_back(option.c_str()); + + int deviceCount = 0; + if (m_native->cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int device_i = 0; device_i < deviceCount; device_i++) + { + CUdevice handle = -1; + if (m_native->cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_native->cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) + continue; + + auto& nativeDevice = m_native->deviceStates.emplace_back(); + nativeDevice.handle = handle; + nativeDevice.uuid = uuid; + auto& cleanDevice = m_availableDevices.emplace_back(); + memcpy(cleanDevice.uuid.data(),&uuid,cleanDevice.uuid.size()); + + for (size_t i = 0; i < nativeDevice.attributes.size(); i++) + m_native->cuda.pcuDeviceGetAttribute(&nativeDevice.attributes[i], static_cast(i), handle); + + } +} + +CCUDAHandler::~CCUDAHandler() = default; + +uint32_t CCUDAHandler::getBuildCUDASDKVersion() +{ + return CUDA_VERSION; +} + +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return m_native->cudaDriverVersion; +} + +cuda_interop::SRuntimeVersion 
CCUDAHandler::getLoadedNVRTCVersion() const +{ + return m_native->nvrtcVersion; +} + +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + return m_native->cuda; +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + return m_native->nvrtc; +} + +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const { + if (m_native->runtimeIncludeOptionPtrs.empty()) + return {nullptr,nullptr}; + const auto* begin = m_native->runtimeIncludeOptionPtrs.data(); + return {begin,begin+m_native->runtimeIncludeOptionPtrs.size()}; +} + +bool CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult, const system::logger_opt_ptr& logger) +{ + const CUresult result = opaqueResult; switch (result) { case CUDA_SUCCESS: @@ -133,6 +643,11 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt This indicates that a PTX JIT compilation failed. )===",system::ILogger::ELL_ERROR); break; + case CUDA_ERROR_UNSUPPORTED_PTX_VERSION: + logger.log(R"===(CCUDAHandler: + This indicates that the PTX version is unsupported by the CUDA driver. Check that the CUDA driver runtime can consume PTX produced by the loaded NVRTC runtime. + )===",system::ILogger::ELL_ERROR); + break; case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: logger.log(R"===(CCUDAHandler: This indicates an error with OpenGL or DirectX context. 
@@ -370,34 +885,54 @@ bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt break; case CUDA_ERROR_UNKNOWN: default: - logger.log("CCUDAHandler: Unknown CUDA Error!\n",system::ILogger::ELL_ERROR); + logger.log("CCUDAHandler: Unknown CUDA error code %d.",system::ILogger::ELL_ERROR,static_cast(result)); break; } - _NBL_DEBUG_BREAK_IF(true); return false; } -bool CCUDAHandler::defaultHandleResult(nvrtcResult result) +bool CCUDAHandler::defaultHandleResult(cuda_interop::SCUresult opaqueResult) const { + const CUresult result = opaqueResult; + if (result==CUDA_ERROR_UNSUPPORTED_PTX_VERSION) + { + const auto cudaVersion = getLoadedCUDADriverVersion(); + const auto nvrtcVersion = getLoadedNVRTCVersion(); + getLogger().log( + "CCUDAHandler: CUDA driver API %d.%d rejected PTX produced through NVRTC %d.%d. Install a newer NVIDIA driver or use an NVRTC/runtime-header set compatible with the installed driver.", + system::ILogger::ELL_ERROR, + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion), + nvrtcVersion[0],nvrtcVersion[1] + ); + } + return defaultHandleResult(opaqueResult,getLogger()); +} + +bool CCUDAHandler::defaultHandleResult(cuda_interop::SNVRTCResult opaqueResult) const +{ + const nvrtcResult result = opaqueResult; + const auto& nvrtc = getNVRTCFunctionTable(); + const auto logger = getLogger(); switch (result) { case NVRTC_SUCCESS: return true; break; default: - if (m_nvrtc.pnvrtcGetErrorString) - m_logger.log("%s\n",system::ILogger::ELL_ERROR,m_nvrtc.pnvrtcGetErrorString(result)); + if (nvrtc.pnvrtcGetErrorString) + logger.log("%s\n",system::ILogger::ELL_ERROR,nvrtc.pnvrtcGetErrorString(result)); else - m_logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); + logger.log(R"===(CudaHandler: `pnvrtcGetErrorString` is nullptr, the nvrtc library probably not found on the system.\n)===",system::ILogger::ELL_ERROR); break; } - 
_NBL_DEBUG_BREAK_IF(true); return false; } core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* system, core::smart_refctd_ptr&& _logger) { - CUDA cuda = CUDA( + const system::logger_opt_ptr logger(_logger.get()); + + cuda_native::CUDA cuda = cuda_native::CUDA( #if defined(_NBL_WINDOWS_API_) "nvcuda" #elif defined(_NBL_POSIX_API_) @@ -406,61 +941,182 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste #error "Unsuported Platform" #endif ); - - NVRTC nvrtc = {}; - #if defined(_NBL_WINDOWS_API_) - // Perpetual TODO: any new CUDA releases we need to account for? - const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; - const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; - for (auto verpath=nvrtc64_versions; *verpath; verpath++) - { - for (auto suffix=nvrtc64_suffices; *suffix; suffix++) - { - std::string path(*verpath); - path += *suffix; - nvrtc = NVRTC(path.c_str()); - if (nvrtc.pnvrtcVersion) - break; - } - if (nvrtc.pnvrtcVersion) - break; - } - #elif defined(_NBL_POSIX_API_) - nvrtc = NVRTC("nvrtc"); - //nvrtc_builtins = NVRTC("nvrtc-builtins"); - #else - #error "Unsuported Platform" - #endif - // need a complex safe calling chain because DLL/SO might not have loaded #define SAFE_CUDA_CALL(FUNC,...) \ {\ if (!cuda.p ## FUNC)\ + {\ + logger.log("CCUDAHandler: CUDA Driver API function %s was not found. 
Need CUDA driver runtime %d.%d or newer.",system::ILogger::ELL_ERROR,#FUNC,cudaVersionMajor(cuda_native::MinimumCUDADriverVersion),cudaVersionMinor(cuda_native::MinimumCUDADriverVersion));\ return nullptr;\ - auto result = cuda.p ## FUNC ## (__VA_ARGS__);\ + }\ + auto result = cuda.p ## FUNC(__VA_ARGS__);\ if (result!=CUDA_SUCCESS)\ + {\ + logger.log("CCUDAHandler: %s failed with CUDA error code %d.",system::ILogger::ELL_ERROR,#FUNC,static_cast(result));\ return nullptr;\ + }\ } SAFE_CUDA_CALL(cuInit,0) int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<9000) + if (cudaVersion bool + { + if (!candidate.pnvrtcVersion) + return false; + + const auto result = candidate.pnvrtcVersion(version.data(),version.data()+1); + if (result==NVRTC_SUCCESS) + return true; + + logger.log("CCUDAHandler: nvrtcVersion failed for %s with NVRTC error code %d.",system::ILogger::ELL_WARNING,name,static_cast(result)); + version = {-1,-1}; + return false; + }; + + cuda_native::NVRTC nvrtc = {}; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; + std::string nvrtcLibraryName; + + #if defined(_NBL_WINDOWS_API_) + cuda_native::NVRTC fallbackNVRTC = {}; + cuda_interop::SRuntimeVersion fallbackNVRTCVersion = {-1,-1}; + std::string fallbackNVRTCLibraryName; + + /* + The CUDA driver consumes the final PTX, not the toolkit that provided headers or nvrtc*.dll. + A real machine can have an older NVIDIA driver and a newer CUDA Toolkit side by side, for example + driver API 13.1 from nvcuda.dll with CUDA 13.2 Toolkit/NVRTC in CUDA_PATH. In that setup NVRTC can + emit PTX the installed driver rejects with CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Prefer an NVRTC runtime + that is not newer than the loaded driver and log the full version matrix when no compatible one exists. 
+ */ + const char* nvrtc64_versions[] = { + "nvrtc64_132", + "nvrtc64_131", + "nvrtc64_130", + nullptr + }; + + const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; + for (auto verpath=nvrtc64_versions; *verpath && !nvrtc.pnvrtcVersion; verpath++) + { + for (auto suffix=nvrtc64_suffices; *suffix; suffix++) + { + std::string candidateName(*verpath); + candidateName += *suffix; + + cuda_native::NVRTC candidate(candidateName.c_str()); + cuda_interop::SRuntimeVersion candidateVersion = {-1,-1}; + if (!readNVRTCVersion(candidate,candidateVersion,candidateName.c_str())) + continue; + + if (cudaVersionCode(candidateVersion[0],candidateVersion[1])<=cudaVersion) + { + nvrtc = std::move(candidate); + nvrtcVersion = candidateVersion; + nvrtcLibraryName = std::move(candidateName); + break; + } + + if (!fallbackNVRTC.pnvrtcVersion) + { + fallbackNVRTC = std::move(candidate); + fallbackNVRTCVersion = candidateVersion; + fallbackNVRTCLibraryName = std::move(candidateName); + } + } + } + + if (!nvrtc.pnvrtcVersion && fallbackNVRTC.pnvrtcVersion) + { + nvrtc = std::move(fallbackNVRTC); + nvrtcVersion = fallbackNVRTCVersion; + nvrtcLibraryName = std::move(fallbackNVRTCLibraryName); + } + #elif defined(_NBL_POSIX_API_) + nvrtcLibraryName = "nvrtc"; + nvrtc = cuda_native::NVRTC(nvrtcLibraryName.c_str()); + readNVRTCVersion(nvrtc,nvrtcVersion,nvrtcLibraryName.c_str()); + #else + #error "Unsuported Platform" + #endif // check nvrtc existence and compatibility if (!nvrtc.pnvrtcVersion) + { + logger.log("CCUDAHandler: NVRTC runtime was not found. Need NVRTC %d.x or newer.",system::ILogger::ELL_ERROR,cuda_native::MinimumNVRTCMajorVersion); return nullptr; - int nvrtcVersion[2] = { -1,-1 }; - nvrtc.pnvrtcVersion(nvrtcVersion+0,nvrtcVersion+1); - if (nvrtcVersion[0]<9) + } + if (nvrtcVersion[0]cudaVersion) + { + logger.log( + "CCUDAHandler: NVRTC runtime %d.%d is newer than CUDA driver API %d.%d. 
PTX generated by this NVRTC may be unsupported by the installed driver.", + system::ILogger::ELL_WARNING, + nvrtcVersion[0],nvrtcVersion[1], + cudaVersionMajor(cudaVersion),cudaVersionMinor(cudaVersion) + ); + } + if (runtimeEnvironment.includeDirs.empty()) + { + logger.log("CCUDAHandler: no CUDA runtime headers were discovered for NVRTC include paths.",system::ILogger::ELL_WARNING); + } + else + { + const auto& primaryIncludeDir = runtimeEnvironment.includeDirInfos.front(); + if (!primaryIncludeDir.completeRuntimeHeaderSet) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime header path %s does not contain cuda.h, cuda_runtime_api.h, and vector_types.h together. NVRTC may use later include paths for missing headers.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.path.generic_string().c_str() + ); + } + + const auto nvrtcVersionCode = cudaVersionCode(nvrtcVersion[0],nvrtcVersion[1]); + if (primaryIncludeDir.cudaVersion!=0u && primaryIncludeDir.cudaVersion!=static_cast(nvrtcVersionCode)) + { + logger.log( + "CCUDAHandler: primary NVRTC runtime headers report CUDA_VERSION %u (%s), while loaded NVRTC is %s. 
This is allowed by discovery policy, but kernels using version-specific CUDA headers may fail to compile.", + system::ILogger::ELL_WARNING, + primaryIncludeDir.cudaVersion, + cudaVersionString(primaryIncludeDir.cudaVersion).c_str(), + cudaVersionString(nvrtcVersion).c_str() + ); + } + } // add headers core::vector> headers; @@ -468,18 +1124,20 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, + std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } - - CCUDAHandler* handler = new CCUDAHandler(std::move(cuda), std::move(nvrtc),std::move(headers), std::move(_logger), cudaVersion); - return core::smart_refctd_ptr(handler,core::dont_grab); + return core::smart_refctd_ptr( + new CCUDAHandler(std::make_unique(std::move(cuda),std::move(nvrtc),cudaVersion,nvrtcVersion,std::move(runtimeEnvironment)),std::move(headers),std::move(_logger)), + core::dont_grab + ); } -nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) +cuda_interop::SNVRTCResult CCUDAHandler::createProgram(cuda_interop::SOutput prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) { #if defined(_NBL_WINDOWS_API_) source.insert(0ull,"#ifndef _WIN64\n#define _WIN64\n#endif\n"); @@ -488,33 +1146,94 @@ nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source #else #error "Unsuported Platform" #endif - return m_nvrtc.pnvrtcCreateProgram(prog,source.c_str(),name,headerCount,headerContents,includeNames); + nvrtcProgram nativeProgram = nullptr; + const auto result = 
getNVRTCFunctionTable().pnvrtcCreateProgram(&nativeProgram,source.c_str(),name,headerCount,headerContents,includeNames); + if (prog) + *prog = nativeProgram; + return result; } -nvrtcResult CCUDAHandler::getProgramLog(nvrtcProgram prog, std::string& log) +cuda_interop::SNVRTCResult CCUDAHandler::compileProgram(cuda_interop::SNVRTCProgram prog, core::SRange options) const +{ + const nvrtcProgram nativeProgram = prog; + return getNVRTCFunctionTable().pnvrtcCompileProgram(nativeProgram,options.size(),options.begin()); +} + +cuda_interop::SNVRTCResult CCUDAHandler::getProgramLog(cuda_interop::SNVRTCProgram prog, std::string& log) const { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetProgramLogSize(prog, &_size); + const nvrtcProgram nativeProgram = prog; + const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetProgramLogSize(nativeProgram, &_size); if (sizeRes != NVRTC_SUCCESS) return sizeRes; if (_size == 0ull) return NVRTC_ERROR_INVALID_INPUT; log.resize(_size); - return m_nvrtc.pnvrtcGetProgramLog(prog,log.data()); + return nvrtc.pnvrtcGetProgramLog(nativeProgram,log.data()); } -CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) +CCUDAHandler::SPTXResult CCUDAHandler::getPTX(cuda_interop::SNVRTCProgram prog) const { size_t _size = 0ull; - nvrtcResult sizeRes = m_nvrtc.pnvrtcGetPTXSize(prog,&_size); + const nvrtcProgram nativeProgram = prog; + const auto& nvrtc = getNVRTCFunctionTable(); + nvrtcResult sizeRes = nvrtc.pnvrtcGetPTXSize(nativeProgram,&_size); if (sizeRes!=NVRTC_SUCCESS) return {nullptr,sizeRes}; if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({ _size }); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + asset::ICPUBuffer::SCreationParams ptxParams = {}; + ptxParams.size = _size; + auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); + auto ptxPtr = static_cast(ptx->getPointer()); + 
return {std::move(ptx),nvrtc.pnvrtcGetPTX(nativeProgram,ptxPtr)}; +} + +static CCUDAHandler::SPTXResult compileDirectlyToPTX_impl(CCUDAHandler& handler, cuda_interop::SNVRTCResult result, cuda_interop::SNVRTCProgram program, core::SRange nvrtcOptions, std::string* log) +{ + if (log) + log->clear(); + const nvrtcResult nativeResult = result; + if (nativeResult!=NVRTC_SUCCESS) + return {nullptr,result}; + + const auto runtimeIncludeOptions = handler.getDefaultRuntimeIncludeOptions(); + core::vector options; + options.reserve(nvrtcOptions.size()+runtimeIncludeOptions.size()); + for (const auto option : nvrtcOptions) + options.push_back(option); + for (const auto option : runtimeIncludeOptions) + options.push_back(option); + + const auto* optionsBegin = options.empty() ? nullptr:options.data(); + const auto* optionsEnd = options.empty() ? nullptr:optionsBegin+options.size(); + result = handler.compileProgram(program,{optionsBegin,optionsEnd}); + if (log) + handler.getProgramLog(program,*log); + if (static_cast(result)!=NVRTC_SUCCESS) + return {nullptr,result}; + + return handler.getPTX(program); +} + +CCUDAHandler::SPTXResult CCUDAHandler::compileDirectlyToPTX( + std::string&& source, const char* filename, core::SRange nvrtcOptions, + std::string* log, const int headerCount, const char* const* headerContents, const char* const* includeNames) +{ + cuda_interop::SNVRTCProgram program = {}; + cuda_interop::SNVRTCResult result = NVRTC_ERROR_PROGRAM_CREATION_FAILURE; + auto cleanup = core::makeRAIIExiter([&]() -> void + { + nvrtcProgram nativeProgram = program; + if (nativeProgram) + getNVRTCFunctionTable().pnvrtcDestroyProgram(&nativeProgram); + }); + + result = createProgram(program,std::move(source),filename,headerCount,headerContents,includeNames); + return compileDirectlyToPTX_impl(*this,result,program,nvrtcOptions,log); } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -525,28 +1244,13 @@ 
core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinaldeviceStates) { - CUdevice handle = -1; - if (m_cuda.pcuDeviceGet(&handle,ordinal)!=CUDA_SUCCESS || handle<0) - continue; - - CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) - continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { case 3: @@ -624,10 +1328,78 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + auto cudaDevice = core::smart_refctd_ptr( + new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch,std::make_unique(device.handle),core::smart_refctd_ptr(this)), + core::dont_grab + ); + if (!cudaDevice->isValid()) + return nullptr; + return std::move(cudaDevice); + } + } + return nullptr; +} + +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAHandler::SNativeState {}; + +CCUDAHandler::CCUDAHandler( + std::unique_ptr&& nativeState, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger) + : m_native(std::move(nativeState)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) +{ + assert(m_native); +} + +CCUDAHandler::~CCUDAHandler() = default; + +uint32_t CCUDAHandler::getBuildCUDASDKVersion() +{ + return 0u; +} + +uint32_t CCUDAHandler::getLoadedCUDADriverVersion() const +{ + return 0u; +} + +cuda_interop::SRuntimeVersion CCUDAHandler::getLoadedNVRTCVersion() const +{ + return {-1,-1}; +} + +const cuda_native::CUDA& CCUDAHandler::getCUDAFunctionTable() const +{ + std::abort(); +} + +const cuda_native::NVRTC& CCUDAHandler::getNVRTCFunctionTable() const +{ + std::abort(); +} + +core::SRange CCUDAHandler::getDefaultRuntimeIncludeOptions() const +{ + return {nullptr,nullptr}; +} + +core::smart_refctd_ptr CCUDAHandler::create(system::ISystem*, core::smart_refctd_ptr&&) +{ + return nullptr; +} + +core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&&, IPhysicalDevice*) +{ return nullptr; } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp new file mode 100644 index 0000000000..ec5438643f --- /dev/null +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + +namespace nbl::video +{ + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const +{ + return m_native->handle; +} + +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput mappedBuffer) const +{ + if (!mappedBuffer) + return false; + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + CUdeviceptr nativeMappedBuffer = 0; + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto result = cu.pcuExternalMemoryGetMappedBuffer(&nativeMappedBuffer, m_native->handle, &bufferDesc); + if (!m_device->getHandler()->defaultHandleResult(result)) + return false; + + *mappedBuffer = nativeMappedBuffer; + return true; +} + +CCUDAImportedMemory::~CCUDAImportedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalMemory(m_native->handle)); +} + +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedMemory::SNativeState {}; + +CCUDAImportedMemory::CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDAImportedMemory::~CCUDAImportedMemory() = default; + +cuda_interop::SCUexternalMemory CCUDAImportedMemory::getInternalObject() const +{ + return {}; +} + +bool CCUDAImportedMemory::getMappedBuffer(cuda_interop::SOutput) const +{ + return false; +} + +} + +#endif diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp new file mode 100644 index 0000000000..49495e11e2 --- /dev/null +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -0,0 +1,57 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CUDAInterop.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +#include "CUDAInteropNativeState.hpp" + +namespace nbl::video +{ +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() const +{ + return m_native->handle; +} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + m_device->getHandler()->defaultHandleResult(cu.pcuDestroyExternalSemaphore(m_native->handle)); +} +} + +#else + +namespace nbl::video +{ + +// CUDA OFF stub keeps the clean public API linkable and reports feature absence with nullptr instead of unresolved symbols. 
+struct CCUDAImportedSemaphore::SNativeState {}; + +CCUDAImportedSemaphore::CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, std::unique_ptr&& nativeState) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_native(std::move(nativeState)) +{ + assert(m_native); +} + +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() = default; + +cuda_interop::SCUexternalSemaphore CCUDAImportedSemaphore::getInternalObject() const +{ + return {}; +} + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CUDAInteropNativeState.hpp b/src/nbl/video/CUDAInteropNativeState.hpp new file mode 100644 index 0000000000..04a70c6e4e --- /dev/null +++ b/src/nbl/video/CUDAInteropNativeState.hpp @@ -0,0 +1,81 @@ +#ifndef _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ +#define _NBL_VIDEO_CUDA_INTEROP_NATIVE_STATE_H_INCLUDED_ + +#include "nbl/video/CUDAInteropNativeAPI.h" + +#include +#include + +namespace nbl::video +{ + +struct CCUDAHandler::SNativeState +{ + struct SDeviceState + { + CUdevice handle = {}; + CUuuid uuid = {}; + std::array attributes = {}; + }; + + cuda_native::CUDA cuda; + cuda_native::NVRTC nvrtc; + int cudaDriverVersion = 0; + cuda_interop::SRuntimeVersion nvrtcVersion = {-1,-1}; + // Snapshot discovery at handler creation so diagnostics and NVRTC compile options describe the same runtime setup. 
+ cuda_interop::SRuntimeCompileEnvironment runtimeEnvironment; + core::vector runtimeIncludeOptions; + core::vector runtimeIncludeOptionPtrs; + core::vector deviceStates; + + SNativeState( + cuda_native::CUDA&& _cuda, + cuda_native::NVRTC&& _nvrtc, + int _cudaDriverVersion, + cuda_interop::SRuntimeVersion _nvrtcVersion, + cuda_interop::SRuntimeCompileEnvironment&& _runtimeEnvironment) + : cuda(std::move(_cuda)) + , nvrtc(std::move(_nvrtc)) + , cudaDriverVersion(_cudaDriverVersion) + , nvrtcVersion(_nvrtcVersion) + , runtimeEnvironment(std::move(_runtimeEnvironment)) + , runtimeIncludeOptions(cuda_interop::makeNVRTCIncludeOptions(runtimeEnvironment)) + {} +}; + +struct CCUDADevice::SNativeState +{ + CUdevice handle = {}; + CUcontext context = nullptr; + + explicit SNativeState(CUdevice _handle) + : handle(_handle) + {} +}; + +struct CCUDAExportableMemory::SNativeState +{ + CUdeviceptr ptr = 0; +}; + +struct CCUDAImportedMemory::SNativeState +{ + CUexternalMemory handle = nullptr; + + explicit SNativeState(CUexternalMemory _handle) + : handle(_handle) + {} +}; + +struct CCUDAImportedSemaphore::SNativeState +{ + CUexternalSemaphore handle = nullptr; + + explicit SNativeState(CUexternalSemaphore _handle) + : handle(_handle) + {} +}; + +} + +#endif diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..944d7db205 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev, std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) 
const override; diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index a04b5940ce..40b20bb5d2 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -90,10 +90,10 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily switch (in.ownershipOp) { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; } } diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 90b2993cb3..955885b7ae 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 @@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? 
VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) { + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; @@ -67,11 +67,71 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) + return nullptr; + + ISemaphore::SCreationParams creationParams; + creationParams.initialValue = initialValue; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore, system::ExternalHandleNull); +} + +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(ISemaphore::SCreationParams&& creationParams) +{ + + // TODO(kevin) : Handle importing external semaphore into Vulkan + // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; + + VkExportSemaphoreCreateInfo exportInfo = { + 
VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, + nullptr, + static_cast(creationParams.externalHandleTypes.value) + }; + + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; + type.pNext = creationParams.externalHandleTypes.value ? &exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo + type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; + type.initialValue = creationParams.initialValue; + + VkSemaphoreCreateInfo createInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,&type }; + createInfo.flags = static_cast(0); // flags must be 0 + + VkSemaphore semaphore; + if (m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) != VK_SUCCESS) return nullptr; + + system::external_handle_t externalHandle = system::ExternalHandleNull; + const auto handleType = static_cast(creationParams.externalHandleTypes.value); + if (handleType != 0) + { +#ifdef _WIN32 + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = handleType, + }; + + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); + return nullptr; + } +#else + VkSemaphoreGetFdInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = semaphore, + .handleType = handleType, + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreFdKHR(m_vkdev, &props, &externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); + return nullptr; + } +#endif + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore, externalHandle); } + ISemaphore::WAIT_RESULT 
CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { using retval_t = ISemaphore::WAIT_RESULT; @@ -136,26 +196,72 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } - IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if (info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + +#ifdef _WIN32 + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + }; + + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; +#else + VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = info.importHandle, + }; +#endif + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(info.externalHandleType), + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + system::external_handle_t externalHandle = 
system::ExternalHandleNull; + if (info.externalHandleType) + { + if (info.importHandle) //importing + { + externalHandle = system::DuplicateExternalHandle(info.importHandle); +#ifdef _WIN32 + importInfo.handle = externalHandle; +#else + importInfo.fd = externalHandle; +#endif + *pNext = &importInfo; + } + else // exporting + *pNext = &exportInfo; + pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -166,22 +272,65 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.importHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR +#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef _WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. 
To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. +#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, &externalHandle)) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, externalHandle, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { @@ -299,11 +448,17 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin } -core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + vk_createInfo.pNext = creationParams.externalHandleTypes.value ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); @@ -319,7 +474,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this, std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -338,7 +493,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { const bool hasStencil = asset::isDepthOrStencilFormat(params.format) && !asset::isDepthOnlyFormat(params.format); VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; @@ -354,7 +509,14 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); + const bool external = params.externalHandleTypes.value; + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; + vk_createInfo.pNext = external ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(params.flags.value); vk_createInfo.imageType = static_cast(params.type); vk_createInfo.format = getVkFormatFromFormat(params.format); @@ -372,12 +534,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + // The Vulkan spec states: If the pNext chain includes a VkExternalMemoryImageCreateInfo or VkExternalMemoryImageCreateInfoNV structure whose handleTypes member is not 0, initialLayout must be VK_IMAGE_LAYOUT_UNDEFINED + vk_createInfo.initialLayout = external ? VK_IMAGE_LAYOUT_UNDEFINED : (params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED); VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) @@ -548,7 +711,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc vkDescSetLayoutBinding.stageFlags = getVkShaderStageFlagsFromShaderStage(binding.stageFlags); vkDescSetLayoutBinding.pImmutableSamplers = nullptr; - if ((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER or binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) and binding.immutableSamplers and binding.count) + if ((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER || binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) && binding.immutableSamplers && binding.count) 
{ // If descriptorType is VK_DESCRIPTOR_TYPE_SAMPLER or VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, and descriptorCount is not 0 and pImmutableSamplers is not NULL: // pImmutableSamplers must be a valid pointer to an array of descriptorCount valid VkSampler handles. diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index e77386cb34..0f2d7f160f 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -54,6 +54,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice // sync stuff core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(ISemaphore::SCreationParams&& creationParams = {}) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; @@ -110,9 +111,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline 
core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..dd2df9ea29 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,14 +4,20 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + const system::external_handle_t externalHandle, + SCreationParams&& params +) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle), m_externalHandle(externalHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_externalHandle != system::ExternalHandleNull) + { + const auto success = system::CloseExternalHandle(m_externalHandle); + if (!success) m_vulkanDevice->getLogger()->log("Failed to close external handle for Vulkan memory allocation", system::ILogger::ELL_ERROR); + assert(success); + } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..5833384f9b 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,14 +15,22 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const 
core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + const system::external_handle_t externalHandle, + SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } + inline system::external_handle_t getExportHandle() const override + { + // Do not return duplicated importHandle + if (m_params.importHandle == system::ExternalHandleNull) + return m_externalHandle; + return system::ExternalHandleNull; + } + private: ~CVulkanMemoryAllocation(); @@ -31,6 +39,10 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation core::smart_refctd_ptr m_vulkanDevice; const VkDeviceMemory m_deviceMemoryHandle; + + // Can store either duplicated importHandle or exportHandle. + // This handle will be closed when destructor is called, unlike importHandle in SCreationParams. + const system::external_handle_t m_externalHandle; }; } diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 65a0c358cc..03647a12f2 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1,5 +1,6 @@ #include "nbl/video/CVulkanPhysicalDevice.h" #include "nbl/video/CVulkanLogicalDevice.h" +#include "nbl/video/IGPUImage.h" namespace nbl::video { @@ -1385,6 +1386,63 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart #undef RETURN_NULL_PHYSICAL_DEVICE +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usages.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + 
vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + VkPhysicalDeviceExternalImageFormatInfo externalImageFormatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 formatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &externalImageFormatInfo, + .format = getVkFormatFromFormat(info.format), + .type = static_cast(info.type), + .tiling = static_cast(info.tiling), + .usage = getVkImageUsageFlagsFromImageUsageFlags(info.usage.value, asset::isDepthOrStencilFormat(info.format)), + .flags = static_cast(info.flags.value), + }; + + VkExternalImageFormatProperties externalProps = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES, + }; + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + auto re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &formatInfo, &props); + assert(VK_SUCCESS == re); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + 
core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) { // We might alter it to account for dependancies. diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..40e0dd78fe 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,10 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + + SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 071c4b2843..24c11e01fa 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -7,8 +7,15 @@ namespace nbl::video CVulkanSemaphore::~CVulkanSemaphore() { - const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + if (m_creationParams.externalHandleTypes != EHT_NONE) + { + const auto success = system::CloseExternalHandle(m_externalHandle); + if (!success) vulkanDevice->getLogger()->log("Failed to close external handle for Vulkan semaphore", system::ILogger::ELL_ERROR); + assert(success); + } } uint64_t 
CVulkanSemaphore::getCounterValue() const diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..6a5b66b9ac 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,8 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const system::external_handle_t externalHandle) + : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); uint64_t getCounterValue() const override; @@ -24,11 +24,16 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} + system::external_handle_t getExportHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; + + // Can store either duplicated importHandle or exportHandle. 
+ // For now, it only store exportHandle, since we haven't support importing external semaphore yet + const system::external_handle_t m_externalHandle; }; } diff --git a/src/nbl/video/IDeviceMemoryAllocation.cpp b/src/nbl/video/IDeviceMemoryAllocation.cpp index 058f391de1..5f05e8d928 100644 --- a/src/nbl/video/IDeviceMemoryAllocation.cpp +++ b/src/nbl/video/IDeviceMemoryAllocation.cpp @@ -14,7 +14,7 @@ IDeviceMemoryAllocation::MemoryRange IDeviceMemoryAllocation::alignNonCoherentRa { const auto alignment = m_originDevice->getPhysicalDevice()->getLimits().nonCoherentAtomSize; range.offset = core::alignDown(range.offset,alignment); - range.length = core::min(core::alignUp(range.length,alignment),m_allocationSize); + range.length = core::min(core::alignUp(range.length,alignment),m_params.allocationSize); return range; } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index bee6381f7a..6c414d2e82 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -298,6 +298,38 @@ bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asse return true; } +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if 
(!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} IQueue::RESULT ILogicalDevice::waitIdle() { @@ -324,6 +356,50 @@ core::smart_refctd_ptr ILogicalDevice::createBufferView(const as return createBufferView_impl(underlying, _fmt); } +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& creationParams) +{ + if (!IGPUImage::validateCreationParameters(creationParams)) + { + m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); + return nullptr; + } + if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) + { + m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(IPhysicalDevice::SImageFormatInfo{ + .format = creationParams.format, + .type = creationParams.type, + .tiling = creationParams.tiling, + .usage = creationParams.usage, + .flags = creationParams.flags + }, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & 
IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + + // TODO: validation of creationParams against the device's limits (sample counts, etc.) see vkCreateImage docs + return createImage_impl(std::move(creationParams), dedicatedOnly); +} core::smart_refctd_ptr ILogicalDevice::compileShader(const SShaderCreationParameters& creationParams) { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 5bb8be8274..16b2851ad6 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2320,7 +2320,7 @@ class MetaDeviceMemoryAllocator final if (memReqs.requiresDedicatedAllocation) { // allocate and bind right away - auto allocation = m_allocator->allocate(memReqs,gpuObj); + auto allocation = m_allocator->allocate(memReqs, { gpuObj }); if (!allocation.isValid()) { m_logger.log("Failed to allocate and bind dedicated memory for %s",system::ILogger::ELL_ERROR,gpuObj->getObjectDebugName()); @@ -2459,12 +2459,11 @@ class MetaDeviceMemoryAllocator final failures.reserve(binItemCount); // ... using allocate_flags_t = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS; - IDeviceMemoryAllocator::SAllocateInfo info = { - .size = 0xdeadbeefBADC0FFEull, // set later - .flags = reqBin.first.needsDeviceAddress ? allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT:allocate_flags_t::EMAF_NONE, - .memoryTypeIndex = memTypeIx, - .dedication = nullptr - }; + IDeviceMemoryAllocator::SAllocateInfo info; + info.allocationSize = 0xdeadbeefBADC0FFEull; // set later + info.allocateFlags = reqBin.first.needsDeviceAddress ? 
allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; + info.memoryTypeIndex = memTypeIx; + info.dedication = nullptr; // allocate in progression of combined allocations, while trying allocate as much as possible in a single allocation auto binItemsIt = binItems.begin(); for (auto firstOffsetIt=offsetsTmp.begin(); firstOffsetIt!=offsetsTmp.end(); ) @@ -2473,7 +2472,7 @@ class MetaDeviceMemoryAllocator final const size_t combinedCount = std::distance(firstOffsetIt,nextOffsetIt); const size_t lastIx = combinedCount-1; // if we take `combinedCount` starting at `firstItem` their allocation would need this size - info.size = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; + info.allocationSize = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; auto allocation = m_allocator->allocate(info); if (allocation.isValid()) {