From 0cd109a474a09947b135b3e9e04979d4af686b97 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:44:01 +0700 Subject: [PATCH 01/83] Add more cuda function to load --- include/nbl/video/CCUDAHandler.h | 51 +++++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..5f165b207a 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -18,7 +18,7 @@ namespace nbl::video class CCUDAHandler : public core::IReferenceCounted { - public: + public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); inline bool defaultHandleResult(CUresult result) { @@ -34,7 +34,7 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; @@ -119,6 +119,24 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -157,13 +175,25 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t 
bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -228,16 +258,8 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + ~CCUDAHandler() = default; // @@ -260,6 +282,7 @@ class CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // + core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; From bbe25abc6c93430b2ad4ad350fb8d37dd0bc3663 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:44:48 +0700 Subject: [PATCH 02/83] Add _NBL_COMPILE_WITH_CUDA_ compile definition on CMakeLists.txt --- src/nbl/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt 
index 9c994bfa41..a680a19eab 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -422,6 +422,10 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() +if(NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) +endif() + set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) From d74349e590492206df24efa742ac76d8977f839a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:47:23 +0700 Subject: [PATCH 03/83] Move CCudaHandler constructor to cpp and query device info and attributes --- src/nbl/video/CCUDAHandler.cpp | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 7fb60d79bf..c111f3c73e 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -11,6 +11,47 @@ namespace nbl::video { +CCUDAHandler::CCUDAHandler( + CUDA&& _cuda, + NVRTC&& _nvrtc, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_cuda(std::move(_cuda)) + , m_nvrtc(std::move(_nvrtc)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{ + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int device_i = 0; device_i < deviceCount; device_i++) + { + CUdevice handle = -1; + if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + continue; + + m_availableDevices.emplace_back(handle, uuid); + + int* attributes = 
m_availableDevices.back().attributes; + for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) + m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + + } +} bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) From 38ed6dbd4affb940430f2367db4ad287ef8fe1e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:47:58 +0700 Subject: [PATCH 04/83] Add missing CFileView.h header in CCudaHandler.cpp --- src/nbl/video/CCUDAHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index c111f3c73e..c8f8a328be 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDAHandler.h" +#include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "jitify/jitify.hpp" From 95338cd941a213381580bd01ddc64f2ff47e698f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 09:48:14 +0700 Subject: [PATCH 05/83] Fix indentation of CCudaHandler.cpp --- src/nbl/video/CCUDAHandler.cpp | 50 ++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index c8f8a328be..1f723ba641 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -53,6 +53,7 @@ CCUDAHandler::CCUDAHandler( } } + bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) @@ -452,7 +453,40 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? 
- const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; + // Version List: https://developer.nvidia.com/cuda-toolkit-archive + const char* nvrtc64_versions[] = { + "nvrtc64_131", + "nvrtc64_130", + "nvrtc64_129", + "nvrtc64_128", + "nvrtc64_126", + "nvrtc64_125", + "nvrtc64_124", + "nvrtc64_123", + "nvrtc64_122", + "nvrtc64_121", + "nvrtc64_120", + "nvrtc64_118", + "nvrtc64_117", + "nvrtc64_116", + "nvrtc64_115", + "nvrtc64_114", + "nvrtc64_113", + "nvrtc64_112", + "nvrtc64_111", + "nvrtc64_110", + "nvrtc64_102", + "nvrtc64_101", + "nvrtc64_100", + "nvrtc64_92", + "nvrtc64_91", + "nvrtc64_90", + "nvrtc64_80", + "nvrtc64_75", + "nvrtc64_70", + nullptr + }; + const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; for (auto verpath=nvrtc64_versions; *verpath; verpath++) { @@ -567,11 +601,11 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) return nullptr; - for (int ordinal=0; ordinal CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i CCUDAHandler::createDevice(core::smart_refct continue; auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + return core::smart_refctd_ptr(device,core::dont_grab); + } + } return nullptr; } From 
3e9dfd2e0c5a03171df9ab542c9499200814b225 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:22:04 +0700 Subject: [PATCH 06/83] Add NBL_API2 to CCudaHandler --- include/nbl/video/CCUDAHandler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 5f165b207a..ef040f5536 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,7 +16,7 @@ namespace nbl::video { -class CCUDAHandler : public core::IReferenceCounted +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); From 1ae7747fecb3e3d080dbc90f44f0a1e86d977efe Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:22:40 +0700 Subject: [PATCH 07/83] Fix fetching deviceUUID logic --- src/nbl/video/CCUDAHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 1f723ba641..9dbf92e770 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -614,7 +614,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i Date: Sat, 28 Feb 2026 15:22:58 +0700 Subject: [PATCH 08/83] Fix usage of CFileView --- src/nbl/video/CCUDAHandler.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 9dbf92e770..75e372b705 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -544,8 +544,10 @@ core::smart_refctd_ptr 
CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, + // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? + std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } From 5018be7b8821ec3b09a381d2ca32d189a9d8f0df Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:23:13 +0700 Subject: [PATCH 09/83] Fix use after move of ptx cpuBuffer --- src/nbl/video/CCUDAHandler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 75e372b705..d0f8043b17 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -591,8 +591,9 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({ _size }); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + auto ptx = asset::ICPUBuffer::create({_size}); + auto ptxPtr = static_cast(ptx->getPointer()); + return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) From 5251b4def078cf20e2080bf89c9d55ac9c3781e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 15:25:19 +0700 Subject: [PATCH 10/83] Improve cpuBuffer initialization using params instead of aggregrate initializer --- src/nbl/video/CCUDAHandler.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index d0f8043b17..4dbf0cb488 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ 
b/src/nbl/video/CCUDAHandler.cpp @@ -591,7 +591,9 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({_size}); + asset::ICPUBuffer::SCreationParams ptxParams = {}; + ptxParams.size = _size; + auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); auto ptxPtr = static_cast(ptx->getPointer()); return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } From d655b1977381c92abf0a0b496e818d63ae3ea009 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 16:48:58 +0700 Subject: [PATCH 11/83] Fix indentation of CCudaHandler.cpp into tabs --- src/nbl/video/CCUDAHandler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 4dbf0cb488..aac9dc67cc 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -606,11 +606,11 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) return nullptr; - for (int ordinal=0; ordinal CCUDAHandler::createDevice(core::smart_refct CUuuid uuid = {}; if (m_cuda.pcuDeviceGetUuid(&uuid,handle)!=CUDA_SUCCESS) continue; - if (!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; for (int i=0; i CCUDAHandler::createDevice(core::smart_refct continue; auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } + return 
core::smart_refctd_ptr(device,core::dont_grab); } + } return nullptr; } From 454710b3aa6cbf8c303cfbbf5ff435218a38db42 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 16:54:49 +0700 Subject: [PATCH 12/83] Iterate m_availableDevices when creatingDevice --- src/nbl/video/CCUDAHandler.cpp | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index aac9dc67cc..add5e3db92 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -606,28 +606,13 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinalgetProperties().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { case 3: From 4645bc4214422b4bb5678d0010d55d9d0792033d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 28 Feb 2026 18:07:36 +0700 Subject: [PATCH 13/83] Implement context creation in CCUDADevice --- include/nbl/video/CCUDADevice.h | 6 +++++- src/nbl/video/CCUDADevice.cpp | 21 +++++++++++++++++++-- src/nbl/video/CCUDAHandler.cpp | 3 +-- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h 
b/include/nbl/video/CCUDADevice.h index 1120224fdb..b204b98b23 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -182,13 +182,17 @@ class CCUDADevice : public core::IReferenceCounted protected: friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture); + CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); ~CCUDADevice() = default; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; IPhysicalDevice* const m_vulkanDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + + core::smart_refctd_ptr m_handler; + CUdevice m_handle; + CUcontext m_context; }; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..bf96c6e78d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -7,13 +7,30 @@ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& _vulkanConnection, + IPhysicalDevice* const _vulkanDevice, + const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, + CUdevice _device, + core::smart_refctd_ptr&& _handler) : + m_defaultCompileOptions(), + m_vulkanConnection(std::move(_vulkanConnection)), + m_vulkanDevice(_vulkanDevice), + m_virtualArchitecture(_virtualArchitecture), + m_handle(_device), + m_handler(std::move(_handler)) { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); 
m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); + + auto& cu = m_handler->getCUDAFunctionTable(); + + CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + assert(CUDA_SUCCESS == re); + re = cu.pcuCtxSetCurrent(m_context); + assert(CUDA_SUCCESS == re); } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index add5e3db92..0eba770c89 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -690,8 +690,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); + return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); } } return nullptr; From 3172ae76cf5da9a49bcb53bdd34b44830c6c125f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 12 Mar 2026 19:46:34 +0700 Subject: [PATCH 14/83] Implement physical device getExternalMemoryProperties --- include/nbl/video/IDeviceMemoryAllocation.h | 13 +++++++ include/nbl/video/IDeviceMemoryBacked.h | 2 ++ include/nbl/video/IPhysicalDevice.h | 40 +++++++++++++++++++++ src/nbl/video/CVulkanPhysicalDevice.cpp | 19 ++++++++++ src/nbl/video/CVulkanPhysicalDevice.h | 2 ++ 5 files changed, 76 insertions(+) diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 00e55a66e3..46f77975fb 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -68,6 +68,19 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted EMHF_MULTI_INSTANCE_BIT = 0x00000002, }; + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + }; + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index b0c0ce05ed..04693456d7 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ b/include/nbl/video/IDeviceMemoryBacked.h @@ -39,6 +39,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify multiple queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 4222a22153..c1a703c993 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -639,6 +639,43 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return std::span(m_initData.qfamProperties->data(),m_initData.qfamProperties->data()+m_initData.qfamProperties->size()); } + enum class E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t + { + EEMF_NONE = 0x0, + EEMF_DEDICATED_ONLY_BIT = 0x1, + EEMF_EXPORTABLE_BIT = 0x2, + EEMF_IMPORTABLE_BIT = 0x4, + }; + + struct SExternalMemoryProperties + { + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes : 7; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes : 7; + // TODO(kevin): This should actually be core::bitflag to be semantically correct. What should we do? 
Should we use bool for each flag instead of enum? + E_EXTERNAL_MEMORY_FEATURE_FLAGS features : 3; + bool operator == (SExternalMemoryProperties const& rhs) const = default; + }; + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usages, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usages &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + + // TODO(kevinyu): Should we cached the properties like Atil does. If yes, needs mutex and mutable specifier. Class become not that simple anymore. + // { + // std::shared_lock lock(m_externalBufferPropertiesMutex); + // auto it = m_externalBufferProperties.find({ usage, handleType }); + // if (it != m_externalBufferProperties.end()) + // return it->second; + // } + // + // std::unique_lock lock(m_externalBufferPropertiesMutex); + // return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + return getExternalMemoryProperties_impl(usages, handleType); + } + struct SBufferFormatPromotionRequest { asset::E_FORMAT originalFormat = asset::EF_UNKNOWN; SFormatBufferUsages::SUsage usages = SFormatBufferUsages::SUsage(); @@ -683,6 +720,9 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable }; inline IPhysicalDevice(SInitData&& _initData) : m_initData(std::move(_initData)) {} + // External memory properties query + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + // ILogicalDevice creation bool validateLogicalDeviceCreation(const ILogicalDevice::SCreationParams& params) const; virtual core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) = 0; diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 
da86d7c9d9..54e8543668 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1371,6 +1371,25 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart #undef RETURN_NULL_PHYSICAL_DEVICE +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usages.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) { // We might alter it to account for dependancies. 
diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..5cb2556d6e 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,8 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const; + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: From f9b8b4fe51848661de1e89370ad838526d2114af Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 12 Mar 2026 21:15:17 +0700 Subject: [PATCH 15/83] Dedicated buffer and image --- include/nbl/asset/IBuffer.h | 2 ++ include/nbl/video/CVulkanDeviceMemoryBacked.h | 6 +++--- include/nbl/video/ILogicalDevice.h | 4 ++-- src/nbl/video/CVulkanBuffer.h | 2 +- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 6 +++--- src/nbl/video/CVulkanLogicalDevice.cpp | 8 ++++---- src/nbl/video/CVulkanLogicalDevice.h | 4 ++-- 7 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index 3a7cbb5983..92ffd3eb4d 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -42,6 +42,8 @@ class IBuffer : public IDescriptor, public core::IBuffer //! synthetic Nabla inventions // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, + + EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ }; //! 
diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index e6d17ddf3e..696d69058f 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface protected: // special constructor for when memory requirements are known up-front (so far only swapchains and internal forwarding here) CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle); - CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) : - CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {} + CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) : + CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {} private: - static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle); + static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle); core::smart_refctd_ptr m_memory = nullptr; size_t m_offset = 0u; diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 756b417c79..b12f4be333 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -1129,9 +1129,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const 
uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..944d7db205 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev, std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) const override; diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 90b2993cb3..39c0efae19 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 
@@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR @@ -319,7 +319,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this, std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -338,7 +338,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { const bool hasStencil = asset::isDepthOrStencilFormat(params.format) && 
!asset::isDepthOnlyFormat(params.format); VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; @@ -377,7 +377,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index e77386cb34..8f43a6783a 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -110,9 +110,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline core::smart_refctd_ptr 
createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override From a2357e2d8ec16bef81c9b4763964ce3db27c9bb3 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Mar 2026 10:37:50 +0700 Subject: [PATCH 16/83] External Memory Feature flags should not be enum class --- include/nbl/video/IPhysicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index c1a703c993..2ae58f22e3 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -639,7 +639,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return std::span(m_initData.qfamProperties->data(),m_initData.qfamProperties->data()+m_initData.qfamProperties->size()); } - enum class E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t + enum E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t { EEMF_NONE = 0x0, EEMF_DEDICATED_ONLY_BIT = 0x1, From 0d9c3d81f2b6681c28a7a00e54eb823260f7f4c1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Mar 2026 11:56:17 +0700 Subject: [PATCH 17/83] External Vulkan Buffer Creation --- include/nbl/video/ILogicalDevice.h | 16 +------------ src/nbl/video/CVulkanLogicalDevice.cpp | 8 ++++++- src/nbl/video/ILogicalDevice.cpp | 32 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 16 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index b12f4be333..1c7393bb57 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -331,21 +331,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! 
Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit (%u)!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Buffer, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 9bf85a5b1e..32e9ac2022 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -301,9 +301,15 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + vk_createInfo.pNext = 
creationParams.externalHandleTypes.value ? &externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 7958efa5c0..01e49a26d2 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -298,6 +298,38 @@ bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asse return true; } +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} IQueue::RESULT ILogicalDevice::waitIdle() { From 89f5ae54224aa619adbafe275f68f3557f4b87df Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 13 Mar 2026 14:44:19 +0700 Subject: [PATCH 18/83] Temporary 
enable compile with cuda flag --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ba3410075..e0068b002a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,7 +70,8 @@ else() message(STATUS "Vulkan SDK is not found") endif() -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) +# TODO(kevinyu): Turn off this flag after I finish developing the PR. +option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" ON) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) From 152830f613e8ce1cf7114babc450aa8544f8fb8f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 14 Mar 2026 12:18:59 +0700 Subject: [PATCH 19/83] Update examples_tests submodule to vk_cuda interop demo branch --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 8f045a1c27..b8abd200a1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8f045a1c27a198f8542456378f865032765378b8 +Subproject commit b8abd200a1a83ce4592f7ad3290d07ae02b4f538 From ea3b49b188504be1b13dad19a8751d762beb2aed Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:35:26 +0700 Subject: [PATCH 20/83] External memory allocation --- include/nbl/video/EApiType.h | 8 ++ include/nbl/video/IDeviceMemoryAllocation.h | 66 +++++++---- include/nbl/video/IDeviceMemoryAllocator.h | 46 ++++++-- src/nbl/video/CVulkanLogicalDevice.cpp | 122 ++++++++++++++++++-- src/nbl/video/CVulkanMemoryAllocation.cpp | 9 +- src/nbl/video/CVulkanMemoryAllocation.h | 7 +- src/nbl/video/IDeviceMemoryAllocation.cpp | 2 +- src/nbl/video/utilities/CAssetConverter.cpp | 13 +-- 8 files changed, 214 insertions(+), 59 deletions(-) diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index e670dc90d8..3e86c8d040 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,14 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; +using 
ExternalHandleType = +#ifdef _WIN32 +void* +#else +int +#endif +; + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 46f77975fb..8de6bd4fa8 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,6 +24,8 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { + friend class IDeviceMemoryAllocator; + friend class ILogicalDevice; public: //! Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) @@ -88,26 +90,26 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! - inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! 
Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ // TODO: Visible is a misnomer, collides with Vulkan memory model nomenclature where visibility only concerns reads, where as this is both read and write (visibility and availability) inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! 
@@ -123,9 +125,9 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return nullptr; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return nullptr; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -166,29 +168,53 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + ExternalHandleType externalHandle = 0; + }; + + struct SCreationParams: SInfo + { + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + const bool dedicated = false; + }; + + inline const SCreationParams& getCreationParams() const { return m_params; } + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) + { + m_postDestroyCleanup = std::move(cleanup); + } + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_params(std::move(params)) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; + const ILogicalDevice* m_originDevice = nullptr; + SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; MemoryRange m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; + std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) } // end namespace nbl::video -#endif - - +#endif \ No newline at end of file diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index e85eec12a0..9201d3f849 100644 --- 
a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -15,11 +15,9 @@ class NBL_API2 IDeviceMemoryAllocator // right now we only support this interface handing out memory for one device or group virtual ILogicalDevice* getDeviceForAllocations() const = 0; - struct SAllocateInfo + struct SAllocateInfo : IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; + size_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. }; @@ -45,8 +43,15 @@ class NBL_API2 IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + ExternalHandleType handle) : + m_allocateFlags(static_cast(allocateFlags.value)), + m_reqs(reqs), + m_handleType(handleType), + m_handle(handle) + {} static inline uint32_t end() {return 32u;} @@ -59,10 +64,12 @@ class NBL_API2 IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { SAllocateInfo ret; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.externalHandle = m_handle; return ret; } @@ -75,13 
+82,21 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; + ExternalHandleType m_handle; }; //! DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + ExternalHandleType handle + ) : + IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } @@ -106,15 +121,22 @@ class NBL_API2 IDeviceMemoryAllocator template inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + IDeviceMemoryBacked* dedication = nullptr, + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, + ExternalHandleType externalHandle = {}, + std::unique_ptr&& postDestroyCleanup = nullptr) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); auto allocation = allocate(allocateInfo); if (allocation.isValid()) + { + 
allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; + } } return {}; } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 32e9ac2022..6d5f896765 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -136,26 +136,85 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } +ExternalHandleType DupeHandle(uint64_t pid, ExternalHandleType handle) +{ +#ifdef _WIN32 + HANDLE re = 0; + + HANDLE cur = GetCurrentProcess(); + HANDLE src = pid ? OpenProcess(GENERIC_ALL, false, pid) : cur; + + if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return 0; + + CloseHandle(src); + return re; +#endif + return handle; +} IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if (info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + +#ifdef _WIN32 + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .handle = info.externalHandle + }; + + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = 
VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; +#else + VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = (int)info.externalHandle, + }; +#endif + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(info.externalHandleType), + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + if (info.externalHandleType) + { + if (info.externalHandle) //importing + { + auto duped = DupeHandle(0, info.externalHandle); + const_cast(info.externalHandle) = duped; + *pNext = &importInfo; + } + else // exporting + *pNext = &exportInfo; + pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -166,22 +225,65 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.externalHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR 
+#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef _WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. +#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { @@ -554,7 +656,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc vkDescSetLayoutBinding.stageFlags = getVkShaderStageFlagsFromShaderStage(binding.stageFlags); vkDescSetLayoutBinding.pImmutableSamplers = nullptr; - if ((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER or binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) and binding.immutableSamplers and binding.count) + if 
((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER || binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) && binding.immutableSamplers && binding.count) { // If descriptorType is VK_DESCRIPTOR_TYPE_SAMPLER or VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, and descriptorCount is not 0 and pImmutableSamplers is not NULL: // pImmutableSamplers must be a valid pointer to an array of descriptorCount valid VkSampler handles. diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..8f50c29939 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,11 +4,10 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + SCreationParams&& params +) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..22e32142c0 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,10 +15,9 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + 
SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } diff --git a/src/nbl/video/IDeviceMemoryAllocation.cpp b/src/nbl/video/IDeviceMemoryAllocation.cpp index 058f391de1..5f05e8d928 100644 --- a/src/nbl/video/IDeviceMemoryAllocation.cpp +++ b/src/nbl/video/IDeviceMemoryAllocation.cpp @@ -14,7 +14,7 @@ IDeviceMemoryAllocation::MemoryRange IDeviceMemoryAllocation::alignNonCoherentRa { const auto alignment = m_originDevice->getPhysicalDevice()->getLimits().nonCoherentAtomSize; range.offset = core::alignDown(range.offset,alignment); - range.length = core::min(core::alignUp(range.length,alignment),m_allocationSize); + range.length = core::min(core::alignUp(range.length,alignment),m_params.allocationSize); return range; } diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index d7f2d7dbbc..4a5890c4b7 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2459,12 +2459,11 @@ class MetaDeviceMemoryAllocator final failures.reserve(binItemCount); // ... using allocate_flags_t = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS; - IDeviceMemoryAllocator::SAllocateInfo info = { - .size = 0xdeadbeefBADC0FFEull, // set later - .flags = reqBin.first.needsDeviceAddress ? allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT:allocate_flags_t::EMAF_NONE, - .memoryTypeIndex = memTypeIx, - .dedication = nullptr - }; + IDeviceMemoryAllocator::SAllocateInfo info; + info.allocationSize = 0xdeadbeefBADC0FFEull; // set later + info.allocateFlags = reqBin.first.needsDeviceAddress ? 
allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; + info.memoryTypeIndex = memTypeIx; + info.dedication = nullptr; // allocate in progression of combined allocations, while trying allocate as much as possible in a single allocation auto binItemsIt = binItems.begin(); for (auto firstOffsetIt=offsetsTmp.begin(); firstOffsetIt!=offsetsTmp.end(); ) @@ -2473,7 +2472,7 @@ class MetaDeviceMemoryAllocator final const size_t combinedCount = std::distance(firstOffsetIt,nextOffsetIt); const size_t lastIx = combinedCount-1; // if we take `combinedCount` starting at `firstItem` their allocation would need this size - info.size = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; + info.allocationSize = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; auto allocation = m_allocator->allocate(info); if (allocation.isValid()) { From 77b92ab7e66fbb659491248bdaa562ae59041fc2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:48:57 +0700 Subject: [PATCH 21/83] Fix indentation on CAssetConverter.cpp --- src/nbl/video/utilities/CAssetConverter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 4a5890c4b7..06bab99dd4 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2461,7 +2461,7 @@ class MetaDeviceMemoryAllocator final using allocate_flags_t = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS; IDeviceMemoryAllocator::SAllocateInfo info; info.allocationSize = 0xdeadbeefBADC0FFEull; // set later - info.allocateFlags = reqBin.first.needsDeviceAddress ? allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; + info.allocateFlags = reqBin.first.needsDeviceAddress ? 
allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; info.memoryTypeIndex = memTypeIx; info.dedication = nullptr; // allocate in progression of combined allocations, while trying allocate as much as possible in a single allocation From 68f740fa813fa2a712ed53d2d5c04462e0401a67 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:49:21 +0700 Subject: [PATCH 22/83] Update jitify submodule --- 3rdparty/jitify | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/jitify b/3rdparty/jitify index 0d6dbd8ccd..1a0ca0e837 160000 --- a/3rdparty/jitify +++ b/3rdparty/jitify @@ -1 +1 @@ -Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a +Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96 From 1c93a9157d5fa9d02415ebaa843749664e8ec209 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 12:52:07 +0700 Subject: [PATCH 23/83] External memory allocation cleanup --- src/nbl/video/CVulkanMemoryAllocation.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 8f50c29939..c817213700 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -11,6 +11,11 @@ CVulkanMemoryAllocation::CVulkanMemoryAllocation( CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_params.externalHandle) + { + bool re = CloseHandle(getCreationParams().externalHandle); + assert(re); + } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } From ae0e177f14e5b57e313a09b43edeb131aded2eaa Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 16 Mar 2026 18:01:09 +0700 Subject: [PATCH 24/83] Implement proper CCUDADevice destructor. 
--- include/nbl/video/CCUDADevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index b204b98b23..047680ba9c 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -183,7 +183,7 @@ class CCUDADevice : public core::IReferenceCounted protected: friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); - ~CCUDADevice() = default; + ~CCUDADevice(); std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index bf96c6e78d..79ba9c2c7a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -143,6 +143,11 @@ CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* } #endif +CCUDADevice::~CCUDADevice() +{ + m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); +} + } #endif // _NBL_COMPILE_WITH_CUDA_ From c83942a771a812ad5664854fc9462f5298e7f4e8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Mar 2026 18:43:18 +0700 Subject: [PATCH 25/83] Implementation of Shared memory between vulkan and cuda --- include/nbl/video/CCUDADevice.h | 24 +++++- include/nbl/video/CCUDASharedMemory.h | 74 ++++++++++++++++++ src/nbl/CMakeLists.txt | 1 + src/nbl/video/CCUDADevice.cpp | 102 ++++++++++++++++++++++++- src/nbl/video/CCUDASharedMemory.cpp | 105 ++++++++++++++++++++++++++ 5 files changed, 304 insertions(+), 2 deletions(-) create mode 100644 include/nbl/video/CCUDASharedMemory.h create mode 100644 src/nbl/video/CCUDASharedMemory.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 047680ba9c..62c3360d1e 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ 
-6,6 +6,7 @@ #include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/CCUDASharedMemory.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -26,7 +27,22 @@ class CCUDAHandler; class CCUDADevice : public core::IReferenceCounted { - public: + public: +#ifdef _WIN32 + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; + static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; +#else + static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; + static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif + + struct SCUDACleaner : video::ICleanup + { + core::smart_refctd_ptr resource; + SCUDACleaner(core::smart_refctd_ptr resource) + : resource(std::move(resource)) + {} + }; enum E_VIRTUAL_ARCHITECTURE { EVA_30, @@ -180,6 +196,11 @@ class CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif + CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { return m_handler.get(); } + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; + CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); protected: friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); @@ -193,6 +214,7 @@ class CCUDADevice : public core::IReferenceCounted 
core::smart_refctd_ptr m_handler; CUdevice m_handle; CUcontext m_context; + size_t m_allocationGranularity[4]; }; } diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h new file mode 100644 index 0000000000..f133dadd81 --- /dev/null +++ b/include/nbl/video/CCUDASharedMemory.h @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ + + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." +#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDAMemoryMapping: public core::IReferenceCounted +{ +}; + +class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUdeviceptr getDeviceptr() const { return m_params.ptr; } + + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + union + { + void* osHandle; + int fd; + }; + }; + + const SCreationParams& getCreationParams() const { return m_params; } + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; + +protected: + + CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + : m_device(std::move(device)) + , m_params(std::move(params)) + {} + ~CCUDASharedMemory() override; + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No 
newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index a680a19eab..09d3587e1d 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,6 +294,7 @@ set(NBL_VIDEO_SOURCES # CUDA video/CCUDAHandler.cpp video/CCUDADevice.cpp + video/CCUDASharedMemory.cpp ) set(NBL_SCENE_SOURCES diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 79ba9c2c7a..9dc3908b6b 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -18,7 +18,8 @@ CCUDADevice::CCUDADevice( m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture), m_handle(_device), - m_handler(std::move(_handler)) + m_handler(std::move(_handler)), + m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); @@ -31,6 +32,20 @@ CCUDADevice::CCUDADevice( assert(CUDA_SUCCESS == re); re = cu.pcuCtxSetCurrent(m_context); assert(CUDA_SUCCESS == re); + + for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + { + uint32_t metaData[16] = { 48 }; + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = {.type = static_cast(i), .id = m_handle }, + .win32HandleMetaData = metaData, + }; + auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); + + assert(CUDA_SUCCESS == re); + } } @@ -143,6 +158,91 @@ CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* } #endif +size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const +{ + return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; +} + +CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +{ + auto& cu = 
m_handler->getCUDAFunctionTable(); + + CUdeviceptr ptr = 0; + if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + return err; + + if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) + { + cu.pcuMemAddressFree(ptr, size); + return err; + } + + CUmemAccessDesc accessDesc = { + .location = { .type = location, .id = m_handle }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; + + if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + cu.pcuMemUnmap(ptr, size); + cu.pcuMemAddressFree(ptr, size); + return err; + } + + *outPtr = ptr; + + return CUDA_SUCCESS; +} + +CUresult CCUDADevice::createSharedMemory( + core::smart_refctd_ptr* outMem, + CCUDASharedMemory::SCreationParams&& inParams) +{ + if (!outMem) + return CUDA_ERROR_INVALID_VALUE; + + CCUDASharedMemory::SCachedCreationParams params = { inParams }; + + auto& cu = m_handler->getCUDAFunctionTable(); + + uint32_t metaData[16] = { 48 }; + + CUmemAllocationProp prop = { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = params.location, .id = m_handle }, + .win32HandleMetaData = metaData, + }; + + params.granularSize = roundToGranularity(params.location, params.size); + + CUmemGenericAllocationHandle mem; + if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) + return err; + + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.osHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) + { + cu.pcuMemRelease(mem); + return err; + } + + if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + { + CloseHandle(params.osHandle); + cu.pcuMemRelease(mem); + return err; + } + + if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + { + CloseHandle(params.osHandle); + return err; + } + + *outMem = core::smart_refctd_ptr(new 
CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + + return CUDA_SUCCESS; +} CCUDADevice::~CCUDADevice() { m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp new file mode 100644 index 0000000000..93ab6f4c48 --- /dev/null +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -0,0 +1,105 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ + +core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + m_params.osHandle, + std::make_unique(core::smart_refctd_ptr(this))).memory; +} + +#if 0 +core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) const +{ + if (!device || 
!m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + auto buf = device->createBuffer({{ + .size = m_params.granularSize, + .usage = usage }, {{ + .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), + .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + .externalHandle = m_params.osHandle + }}}); + + auto req = buf->getMemoryReqs(); + auto pd = device->getPhysicalDevice(); + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; + case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; + // TODO(Atil): Figure out how to handle these + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + default: break; + } + + if (!device->allocate(req, buf.get()).isValid()) + return nullptr; + + return buf; +} + +#endif + +core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const +{ + if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) + return nullptr; + + // auto img = device->createImage({ + // std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, + // IGPUImage::ET_LINEAR, + // IGPUImage::EL_PREINITIALIZED, + // }); + // + // if (exportAsMemory(device, img.get())) + // return img; + + return nullptr; +} + +CCUDASharedMemory::~CCUDASharedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + CUresult re[] = { + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), + }; + CloseHandle(m_params.osHandle); + +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file From 2e457029b6318732a22f958b2422317269d296e1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Mar 2026 18:43:57 +0700 Subject: [PATCH 26/83] Add NBL_API2 modifier to CCUDADevice --- include/nbl/video/CCUDADevice.h | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 62c3360d1e..7668bb2ea5 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -25,7 +25,7 @@ namespace nbl::video { class CCUDAHandler; -class CCUDADevice : public core::IReferenceCounted +class NBL_API2 CCUDADevice : public core::IReferenceCounted { public: #ifdef _WIN32 From 741252f1964ba2f807f260762fbfe37a63b259f9 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 17 Mar 2026 18:46:09 +0700 Subject: [PATCH 27/83] Implementation of Shared semaphore between Vulkan and CUDA --- include/nbl/video/CCUDADevice.h | 6 +++ include/nbl/video/CCUDASharedSemaphore.h | 52 ++++++++++++++++++++++++ include/nbl/video/ILogicalDevice.h | 2 +- include/nbl/video/ISemaphore.h | 34 +++++++++++++++- src/nbl/CMakeLists.txt | 1 + src/nbl/video/CCUDADevice.cpp | 26 ++++++++++++ src/nbl/video/CCUDASharedSemaphore.cpp | 19 +++++++++ src/nbl/video/CVulkanLogicalDevice.cpp | 40 +++++++++++++++--- src/nbl/video/CVulkanLogicalDevice.h | 2 +- src/nbl/video/CVulkanSemaphore.cpp | 9 +++- src/nbl/video/CVulkanSemaphore.h | 4 +- 11 files changed, 183 insertions(+), 12 deletions(-) create mode 100644 include/nbl/video/CCUDASharedSemaphore.h create mode 100644 src/nbl/video/CCUDASharedSemaphore.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7668bb2ea5..e80bd18138 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -7,6 +7,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDASharedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -201,7 +202,12 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } size_t roundToGranularity(CUmemLocationType location, size_t size) const; 
CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + protected: + CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); ~CCUDADevice(); diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h new file mode 100644 index 0000000000..6c69f75438 --- /dev/null +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -0,0 +1,52 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted +{ +public: + friend class CCUDADevice; + + CUexternalSemaphore getInternalObject() const { return m_handle; } + +protected: + + CCUDASharedSemaphore(core::smart_refctd_ptr device, + core::smart_refctd_ptr src, + CUexternalSemaphore semaphore, + ExternalHandleType osHandle) + : m_device(std::move(device)) + , m_src(std::move(m_src)) + , m_handle(semaphore) + , m_osHandle(osHandle) + {} + ~CCUDASharedSemaphore() override; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; + ExternalHandleType m_osHandle; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 1c7393bb57..9f2c589172 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, SCreationParams&& creationParams) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index d4fbdd1756..67a093f9d3 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -15,6 +15,34 @@ namespace nbl::video class ISemaphore : public IBackendObject { public: + + //! 
Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCreationParams + { + // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources + std::unique_ptr preDestroyCleanup = nullptr; + // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources + std::unique_ptr postDestroyCleanup = nullptr; + // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects + bool skipHandleDestroy = false; + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE + //! Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE + ExternalHandleType externalHandle = nullptr; + }; + // basically a pool function virtual uint64_t getCounterValue() const = 0; @@ -146,9 +174,13 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + const SCreationParams& getCreationParams() const { return m_creationParams; } + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} virtual ~ISemaphore() = default; + + SCreationParams m_creationParams; }; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 09d3587e1d..bbec1b1691 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,6 +294,7 @@ set(NBL_VIDEO_SOURCES # CUDA 
video/CCUDAHandler.cpp video/CCUDADevice.cpp + video/CCUDASharedSemaphore.cpp video/CCUDASharedMemory.cpp ) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 9dc3908b6b..b7313b80bf 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -243,6 +243,32 @@ CUresult CCUDADevice::createSharedMemory( return CUDA_SUCCESS; } + +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +{ + if (!sema || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleTypes.value; + auto handle = sema->getCreationParams().externalHandle; + + if (!handleType || !handle) + return CUDA_ERROR_INVALID_VALUE; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { + .type = static_cast(handleType), + .handle = {.win32 = {.handle = handle }}, + }; + + CUexternalSemaphore cusema; + if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + return err; + + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); + return CUDA_SUCCESS; +} + CCUDADevice::~CCUDADevice() { m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp new file mode 100644 index 0000000000..049f93ac13 --- /dev/null +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDASharedSemaphore.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDASharedSemaphore::~CCUDASharedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + cu.pcuDestroyExternalSemaphore(m_handle); + CloseHandle(m_osHandle); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 6d5f896765..cd49be13cd 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -56,10 +56,24 @@ CVulkanLogicalDevice::CVulkanLogicalDevice(core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams) { + + // TODO(kevin) : Handle importing external semaphore into Vulkan + // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; + + VkExportSemaphoreWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, + .dwAccess = /*DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE*/0x80000000L | 1 + }; + VkExportSemaphoreCreateInfo exportInfo = { + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, + &handleInfo, + static_cast(creationParams.externalHandleTypes.value) + }; + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = creationParams.externalHandleTypes.value ? 
&exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; @@ -67,11 +81,27 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; + + if (creationParams.externalHandleTypes.value) + { + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = static_cast(creationParams.externalHandleTypes.value), + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &creationParams.externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + return nullptr; + } + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore); + } + ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { using retval_t = ISemaphore::WAIT_RESULT; diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index 8f43a6783a..09213f28db 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -53,7 +53,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice CVulkanLogicalDevice(core::smart_refctd_ptr&& api, renderdoc_api_t* const rdoc, const IPhysicalDevice* const physicalDevice, const VkDevice vkdev, const SCreationParams& 
params); // sync stuff - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 071c4b2843..792d1f27f1 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -7,8 +7,13 @@ namespace nbl::video CVulkanSemaphore::~CVulkanSemaphore() { - const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + m_creationParams.preDestroyCleanup = nullptr; + if (!m_creationParams.skipHandleDestroy) + { + const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + } } uint64_t CVulkanSemaphore::getCounterValue() const diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..cc5d15d3f4 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,8 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore) + : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore) {} ~CVulkanSemaphore(); uint64_t getCounterValue() const override; From 
fe75ce017e64279203c9aaa7a5c1e79a5264a691 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:31:26 +0700 Subject: [PATCH 28/83] Update to CUDA Toolkit version 13.0+ --- include/nbl/video/CCUDAHandler.h | 4 ++-- src/nbl/video/CCUDADevice.cpp | 2 +- src/nbl/video/CCUDAHandler.cpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index ef040f5536..9de55914b5 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -39,7 +39,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v2 + ,cuCtxCreate_v4 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +62,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid + ,cuDeviceGetUuid_v2 ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index b7313b80bf..3b8ea3bee8 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -28,7 +28,7 @@ CCUDADevice::CCUDADevice( auto& cu = m_handler->getCUDAFunctionTable(); - CUresult re = cu.pcuCtxCreate_v2(&m_context, 0, m_handle); + CUresult re = cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle); assert(CUDA_SUCCESS == re); re = cu.pcuCtxSetCurrent(m_context); assert(CUDA_SUCCESS == re); diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 0eba770c89..c1044dd894 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -42,7 +42,7 @@ CCUDAHandler::CCUDAHandler( continue; CUuuid uuid = {}; - if (m_cuda.pcuDeviceGetUuid(&uuid, handle) != CUDA_SUCCESS) + if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) continue; 
m_availableDevices.emplace_back(handle, uuid); From 78fc0df8c1c15edb88c2f3b48467b296681a0ddc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:38:15 +0700 Subject: [PATCH 29/83] Fix external semaphore --- include/nbl/video/CCUDADevice.h | 5 ++++- include/nbl/video/ILogicalDevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 8 +++++++- src/nbl/video/CVulkanLogicalDevice.cpp | 6 +----- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index e80bd18138..3d40ebff25 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -194,7 +194,10 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); + static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, + + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); +uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); #endif CUdevice getInternalObject() const { return m_handle; } diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 9f2c589172..1dce5e7091 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! 
Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 3b8ea3bee8..ac25bb234a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -256,8 +256,14 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(handleType), + .type = EXTERNAL_SEMAPHORE_HANDLE_TYPE, .handle = {.win32 = {.handle = handle }}, }; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index cd49be13cd..dab9862964 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -62,13 +62,9 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u // TODO(kevin) : Handle importing external semaphore into Vulkan // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; - VkExportSemaphoreWin32HandleInfoKHR handleInfo = { - .sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR, - .dwAccess = /*DXGI_SHARED_RESOURCE_READ | DXGI_SHARED_RESOURCE_WRITE*/0x80000000L | 1 - }; VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, - &handleInfo, + nullptr, static_cast(creationParams.externalHandleTypes.value) }; From 5d19c5bd2be94d2bead2e6f0c6f35108f6495150 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:39:54 +0700 Subject: [PATCH 30/83] External image implementation --- 
include/nbl/video/ILogicalDevice.h | 16 +-------- include/nbl/video/IPhysicalDevice.h | 16 +++++++++ src/nbl/video/CVulkanLogicalDevice.cpp | 10 +++++- src/nbl/video/CVulkanPhysicalDevice.cpp | 39 ++++++++++++++++++++++ src/nbl/video/CVulkanPhysicalDevice.h | 4 ++- src/nbl/video/ILogicalDevice.cpp | 44 +++++++++++++++++++++++++ 6 files changed, 112 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 1dce5e7091..d6d2f8530a 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -335,21 +335,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - // TODO: validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage docs - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams); // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 2ae58f22e3..f8550debce 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -676,6 +676,21 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return getExternalMemoryProperties_impl(usages, handleType); } + struct SImageFormatInfo + { + asset::E_FORMAT format; + IGPUImage::E_TYPE type; + IGPUImage::TILING tiling; + core::bitflag usage; + core::bitflag flags; + }; + SExternalMemoryProperties getExternalImageProperties( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + return getExternalMemoryProperties_impl(info, handleType); + } + struct SBufferFormatPromotionRequest { asset::E_FORMAT originalFormat = asset::EF_UNKNOWN; SFormatBufferUsages::SUsage usages = SFormatBufferUsages::SUsage(); @@ -722,6 +737,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable // External memory properties query virtual SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; // ILogicalDevice creation bool validateLogicalDeviceCreation(const ILogicalDevice::SCreationParams& params) const; diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index dab9862964..c0df8fd9f4 100644 --- 
a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -488,7 +488,14 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); + const bool external = params.externalHandleTypes.value; + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; + vk_createInfo.pNext = external ? &externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(params.flags.value); vk_createInfo.imageType = static_cast(params.type); vk_createInfo.format = getVkFormatFromFormat(params.format); @@ -506,7 +513,8 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + // The Vulkan spec states: If the pNext chain includes a VkExternalMemoryImageCreateInfo or VkExternalMemoryImageCreateInfoNV structure whose handleTypes member is not 0, initialLayout must be VK_IMAGE_LAYOUT_UNDEFINED + vk_createInfo.initialLayout = external ? VK_IMAGE_LAYOUT_UNDEFINED : (params.preinitialized ? 
VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED); VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 54e8543668..64dcc24fc4 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1,5 +1,6 @@ #include "nbl/video/CVulkanPhysicalDevice.h" #include "nbl/video/CVulkanLogicalDevice.h" +#include "nbl/video/IGPUImage.h" namespace nbl::video { @@ -1390,6 +1391,44 @@ IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMem }; } +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + VkPhysicalDeviceExternalImageFormatInfo externalImageFormatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 formatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &externalImageFormatInfo, + .format = getVkFormatFromFormat(info.format), + .type = static_cast(info.type), + .tiling = static_cast(info.tiling), + .usage = getVkImageUsageFlagsFromImageUsageFlags(info.usage.value, asset::isDepthOrStencilFormat(info.format)), + .flags = static_cast(info.flags.value), + }; + + VkExternalImageFormatProperties externalProps = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES, + }; + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + auto re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &formatInfo, &props); + assert(VK_SUCCESS == re); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = 
static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) { // We might alter it to account for dependancies. diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index 5cb2556d6e..40e0dd78fe 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,7 +109,9 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } - SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const; + SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + + SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index 01e49a26d2..da883d3974 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -356,6 +356,50 @@ core::smart_refctd_ptr ILogicalDevice::createBufferView(const as return createBufferView_impl(underlying, _fmt); } +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& creationParams) +{ + if (!IGPUImage::validateCreationParameters(creationParams)) + { + m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); + return nullptr; + } + if 
(creationParams.queueFamilyIndexCount>MaxQueueFamilies) + { + m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(IPhysicalDevice::SImageFormatInfo{ + .format = creationParams.format, + .type = creationParams.type, + .tiling = creationParams.tiling, + .usage = creationParams.usage, + .flags = creationParams.flags + }, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + + // TODO: validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage docs + return createImage_impl(std::move(creationParams), dedicatedOnly); +} core::smart_refctd_ptr ILogicalDevice::compileShader(const SShaderCreationParameters& creationParams) { From f23b30c87eddbcd021f1f65c772eeb2db5684865 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Tue, 24 Mar 2026 15:40:13 +0700 Subject: [PATCH 31/83] Remove unnecessary inline modifier --- include/nbl/video/ILogicalDevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index d6d2f8530a..a3a9b264d0 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -331,7 +331,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) From e50c85e5f774ecf6289bc3fcc26f357f30327c4a Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:21:11 +0700 Subject: [PATCH 32/83] Remove unused code in CCUDADevice --- include/nbl/video/CCUDADevice.h | 115 -------------------------------- src/nbl/video/CCUDADevice.cpp | 110 ------------------------------ 2 files changed, 225 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 3d40ebff25..6b3ab2bbb6 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -85,121 +85,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } - // TODO/REDO 
Vulkan: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability - // Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions) - // Also maybe separate this out into its own `CCUDA` class instead of nesting it here? -#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - } asImage; - }; - }; - - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - - - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, GraphicsAPIObjLink* 
linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; - } - - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, - - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); -uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif - CUdevice getInternalObject() const { return m_handle; } const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 
ac25bb234a..5d1198bb0d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -48,116 +48,6 @@ CCUDADevice::CCUDADevice( } } - -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if (retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) -{ - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; - - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) - { - case GL_TEXTURE_2D: - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; - } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if (retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} - - -constexpr auto MaxAquireOps = 4096u; - -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? 
sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) -{ - if (linksBegin+MaxAquireOpsacquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; - } - return CUDA_SUCCESS; -} -#endif - size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const { return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; From a9c2d85e192972d524e8382e17d13b229ffada58 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:22:04 +0700 Subject: [PATCH 33/83] Fix importSemaphore for unix --- src/nbl/video/CCUDADevice.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 5d1198bb0d..22421522f3 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -146,17 +146,17 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr Date: Wed, 25 Mar 2026 14:22:52 +0700 Subject: [PATCH 34/83] Remove searching for old nvrtc version --- src/nbl/video/CCUDAHandler.cpp | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 
c1044dd894..770db41946 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -455,35 +455,9 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste // Perpetual TODO: any new CUDA releases we need to account for? // Version List: https://developer.nvidia.com/cuda-toolkit-archive const char* nvrtc64_versions[] = { + "nvrtc64_132", "nvrtc64_131", "nvrtc64_130", - "nvrtc64_129", - "nvrtc64_128", - "nvrtc64_126", - "nvrtc64_125", - "nvrtc64_124", - "nvrtc64_123", - "nvrtc64_122", - "nvrtc64_121", - "nvrtc64_120", - "nvrtc64_118", - "nvrtc64_117", - "nvrtc64_116", - "nvrtc64_115", - "nvrtc64_114", - "nvrtc64_113", - "nvrtc64_112", - "nvrtc64_111", - "nvrtc64_110", - "nvrtc64_102", - "nvrtc64_101", - "nvrtc64_100", - "nvrtc64_92", - "nvrtc64_91", - "nvrtc64_90", - "nvrtc64_80", - "nvrtc64_75", - "nvrtc64_70", nullptr }; @@ -523,7 +497,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<9000) + if (cudaVersion<13000) return nullptr; // stop the pollution From c244b77a7feffdec21441c4f75cdc3228f32a3b8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:23:50 +0700 Subject: [PATCH 35/83] Fix filling dstQueueFamilyIndex --- src/nbl/video/CVulkanCommandBuffer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index a04b5940ce..40b20bb5d2 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -90,10 +90,10 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily switch (in.ownershipOp) { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; case 
IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; } } From d24acf9fd6181a60d1f22d92da3dc434e1c4aceb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 25 Mar 2026 14:26:12 +0700 Subject: [PATCH 36/83] Update cuda toolkit requirement in cmake --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0068b002a..5be1855959 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,10 +75,10 @@ option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" ON) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER "9.0") - message(STATUS "CUDA version 9.0+ found!") + if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") else() - message(FATAL_ERROR "CUDA version 9.0+ needed for C++14 support!") + message(FATAL_ERROR "CUDA version 13.0+ needed for C++14 support!") endif() endif() From ff828003da1f7327af4c399fc3dd0d0bd6d22013 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 15 Apr 2026 17:29:38 +0700 Subject: [PATCH 37/83] Improve external semaphore handle management - Separate cached creation params from creation params in ISemaphore - Move external handle storage to backend-specific classes (e.g., CVulkanSemaphore) - Add virtual getExternalHandle() to ISemaphore interface - Update Vulkan semaphore creation to retrieve and store external handle after creation - Ensure proper cleanup of external handles in CVulkanSemaphore destructor --- include/nbl/video/CCUDASharedSemaphore.h | 2 -- include/nbl/video/ISemaphore.h | 24 +++++++++----------- src/nbl/video/CCUDADevice.cpp | 11 +++++---- src/nbl/video/CCUDASharedSemaphore.cpp | 4 ++-- src/nbl/video/CVulkanLogicalDevice.cpp | 29 ++++++++++++++++++------ src/nbl/video/CVulkanSemaphore.cpp | 16 
+++++++++---- src/nbl/video/CVulkanSemaphore.h | 6 +++-- 7 files changed, 56 insertions(+), 36 deletions(-) diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h index 6c69f75438..8a3a73d0b4 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -35,14 +35,12 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) - , m_osHandle(osHandle) {} ~CCUDASharedSemaphore() override; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; CUexternalSemaphore m_handle; - ExternalHandleType m_osHandle; }; } diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 67a093f9d3..59886b32cb 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -28,21 +28,14 @@ class ISemaphore : public IBackendObject }; //! - struct SCreationParams + struct SCachedCreationParams { - // A Pre-Destroy-Step is called out just before a `vkDestory` or `glDelete`, this is only useful for "imported" resources - std::unique_ptr preDestroyCleanup = nullptr; - // A Post-Destroy-Step is called in this class' destructor, this is only useful for "imported" resources - std::unique_ptr postDestroyCleanup = nullptr; - // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects - bool skipHandleDestroy = false; // Handle Type for external resources core::bitflag externalHandleTypes = EHT_NONE; - //! Imports the given handle if externalHandle != nullptr && externalMemoryHandleType != EHT_NONE - //! 
Creates exportable memory if externalHandle == nullptr && externalMemoryHandleType != EHT_NONE - ExternalHandleType externalHandle = nullptr; }; + struct SCreationParams : SCachedCreationParams {}; + // basically a pool function virtual uint64_t getCounterValue() const = 0; @@ -174,13 +167,18 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; - const SCreationParams& getCreationParams() const { return m_creationParams; } + virtual ExternalHandleType getExternalHandle() const = 0; + + const SCachedCreationParams& getCreationParams() const { return m_creationParams; } + + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} + inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : + IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} virtual ~ISemaphore() = default; - SCreationParams m_creationParams; + SCachedCreationParams m_creationParams; }; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 22421522f3..a5dcb52d8a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -141,18 +141,18 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptrgetCUDAFunctionTable(); auto handleType = sema->getCreationParams().externalHandleTypes.value; - auto handle = sema->getCreationParams().externalHandle; - if (!handleType || !handle) + if (!handleType) return CUDA_ERROR_INVALID_VALUE; CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, - .handle = {.win32 = {.handle = handle }}, + // TODO(kevinyu): Fix this later. Make it compile first. 
+ .handle = {.win32 = {.handle = sema->getExternalHandle() }}, #else .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, - .handle = {.fd = handle} + .handle = {.fd = sema->getExternalHandle()} #endif }; @@ -161,7 +161,8 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, handle), core::dont_grab); + // TODO(kevinyu): Fix the handle parameter later. Make it compile first. + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, {}), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp index 049f93ac13..ae2291035a 100644 --- a/src/nbl/video/CCUDASharedSemaphore.cpp +++ b/src/nbl/video/CCUDASharedSemaphore.cpp @@ -11,8 +11,8 @@ namespace nbl::video CCUDASharedSemaphore::~CCUDASharedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - cu.pcuDestroyExternalSemaphore(m_handle); - CloseHandle(m_osHandle); + if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) + assert(!"Invalid code path."); } } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index c0df8fd9f4..66a0198402 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -62,7 +62,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u // TODO(kevin) : Handle importing external semaphore into Vulkan // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; - VkExportSemaphoreCreateInfo exportInfo = { + VkExportSemaphoreCreateInfo exportInfo = { VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, nullptr, static_cast(creationParams.externalHandleTypes.value) @@ -80,22 +80,37 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u if 
(!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; - if (creationParams.externalHandleTypes.value) + ExternalHandleType externalHandle = ExternalHandleType{}; + const auto handleType = static_cast(creationParams.externalHandleTypes.value); + if (handleType != 0) { +#ifdef _WIN32 VkSemaphoreGetWin32HandleInfoKHR props = { .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, .semaphore = semaphore, - .handleType = static_cast(creationParams.externalHandleTypes.value), + .handleType = handleType, }; - if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &creationParams.externalHandle)) + + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &externalHandle)) { - m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, 0); + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); return nullptr; } +#else + VkSemaphoreGetFdInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = vkSemaphore, + .handleType = handleType, + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreFdKHR(m_vkdev, &props, &externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); + return nullptr; + } +#endif } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore); - + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore, externalHandle); } ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 792d1f27f1..958849dae2 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -7,12 +7,18 @@ namespace nbl::video CVulkanSemaphore::~CVulkanSemaphore() { - m_creationParams.preDestroyCleanup = nullptr; - if (!m_creationParams.skipHandleDestroy) + const 
CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + if (m_creationParams.externalHandleTypes != EHT_NONE) { - const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - auto* vk = vulkanDevice->getFunctionTable(); - vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); +#ifdef _WIN32 + if (!CloseHandle(m_externalHandle)) + assert(!"Invalid code path."); +#else + if (close(m_externalHandle) != 0) + assert(!"Invalid code path."); +#endif } } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index cc5d15d3f4..3fd4cb82dc 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,8 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const ExternalHandleType externalHandle) + : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); uint64_t getCounterValue() const override; @@ -24,11 +24,13 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} + ExternalHandleType getExternalHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; + const ExternalHandleType m_externalHandle; }; } From 7b486059444fc4146986a0ef5a702fe54202f384 Mon Sep 17 
00:00:00 2001 From: kevyuu Date: Wed, 15 Apr 2026 18:13:35 +0700 Subject: [PATCH 38/83] Improve win32HandleMetadata parameter so it is more readable --- src/nbl/video/CCUDADevice.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index a5dcb52d8a..9bb5e739f5 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,6 +3,8 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDADevice.h" +#include + #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -95,13 +97,14 @@ CUresult CCUDADevice::createSharedMemory( auto& cu = m_handler->getCUDAFunctionTable(); - uint32_t metaData[16] = { 48 }; + OBJECT_ATTRIBUTES metadata = {}; + metadata.Length = sizeof(OBJECT_ATTRIBUTES); CUmemAllocationProp prop = { .type = CU_MEM_ALLOCATION_TYPE_PINNED, .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, .location = { .type = params.location, .id = m_handle }, - .win32HandleMetaData = metaData, + .win32HandleMetaData = &metadata, }; params.granularSize = roundToGranularity(params.location, params.size); From 24ba36e6736cce8ebc6a87a9eef140b625b31bc7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 16 Apr 2026 10:46:17 +0700 Subject: [PATCH 39/83] Refactor CCUDASharedMemory to use ExternalHandleType --- include/nbl/video/CCUDASharedMemory.h | 6 +----- src/nbl/video/CCUDADevice.cpp | 6 +++--- src/nbl/video/CCUDASharedMemory.cpp | 4 ++-- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index f133dadd81..15de1b72c4 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -42,11 +42,7 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted { size_t granularSize; CUdeviceptr ptr; - union - { - void* osHandle; - int fd; - }; + ExternalHandleType externalHandle; }; const SCreationParams& 
getCreationParams() const { return m_params; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 9bb5e739f5..738913e709 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -113,7 +113,7 @@ CUresult CCUDADevice::createSharedMemory( if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemExportToShareableHandle(&params.osHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) + if (auto err = cu.pcuMemExportToShareableHandle(&params.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { cu.pcuMemRelease(mem); return err; @@ -121,14 +121,14 @@ CUresult CCUDADevice::createSharedMemory( if (auto err = reserveAdrressAndMapMemory(&params.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseHandle(params.osHandle); + CloseHandle(params.externalHandle); cu.pcuMemRelease(mem); return err; } if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CloseHandle(params.osHandle); + CloseHandle(params.externalHandle); return err; } diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 93ab6f4c48..2e58fa4756 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -35,7 +35,7 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor dedication, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - m_params.osHandle, + m_params.externalHandle, std::make_unique(core::smart_refctd_ptr(this))).memory; } @@ -97,7 +97,7 @@ CCUDASharedMemory::~CCUDASharedMemory() CUresult re[] = { cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), }; - CloseHandle(m_params.osHandle); + CloseHandle(m_params.externalHandle); } } From 5b4fc27391b035a0f0ff3d5a22ad1c3d3768ed02 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 17 Apr 2026 14:33:23 +0700 Subject: [PATCH 40/83] Refactor
ExternalHandleType --- include/nbl/video/CCUDASharedMemory.h | 2 +- include/nbl/video/CCUDASharedSemaphore.h | 2 +- include/nbl/video/EApiType.h | 32 ++++++++++++++++++++- include/nbl/video/IDeviceMemoryAllocation.h | 2 +- include/nbl/video/IDeviceMemoryAllocator.h | 8 +++--- include/nbl/video/ISemaphore.h | 2 +- src/nbl/video/CCUDADevice.cpp | 4 +-- src/nbl/video/CCUDASharedMemory.cpp | 2 +- src/nbl/video/CVulkanLogicalDevice.cpp | 25 +++------------- src/nbl/video/CVulkanMemoryAllocation.cpp | 2 +- src/nbl/video/CVulkanSemaphore.cpp | 8 +----- src/nbl/video/CVulkanSemaphore.h | 6 ++-- 12 files changed, 51 insertions(+), 44 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 15de1b72c4..20902ac90b 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -42,7 +42,7 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted { size_t granularSize; CUdeviceptr ptr; - ExternalHandleType externalHandle; + external_handle_t externalHandle; }; const SCreationParams& getCreationParams() const { return m_params; } diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h index 8a3a73d0b4..60daec7159 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -31,7 +31,7 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore, - ExternalHandleType osHandle) + external_handle_t osHandle) : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 3e86c8d040..0726049200 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,7 +13,7 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; -using ExternalHandleType = +using external_handle_t = #ifdef 
_WIN32 void* #else @@ -21,6 +21,36 @@ int #endif ; +#ifdef _WIN32 +constexpr external_handle_t ExternalHandleNull = nullptr; +#else +constexpr external_handle_t ExternalHandleNull = -1; +#endif + +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return (close(handle) == 0); +#endif +} + +inline external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE re = ExternalHandleNull; + + const HANDLE cur = GetCurrentProcess(); + if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return re; +#else + return dup(handle); +#endif +} + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 8de6bd4fa8..cd15039203 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -176,7 +176,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE - ExternalHandleType externalHandle = 0; + external_handle_t externalHandle = 0; }; struct SCreationParams: SInfo diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 9201d3f849..8fc07dd698 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -46,7 +46,7 @@ class NBL_API2 IDeviceMemoryAllocator IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, - ExternalHandleType handle) : + external_handle_t handle) : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs), m_handleType(handleType), @@ -83,7 +83,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t m_allocateFlags; IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; - ExternalHandleType m_handle; + external_handle_t m_handle; }; //! 
DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB @@ -94,7 +94,7 @@ class NBL_API2 IDeviceMemoryAllocator const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, - ExternalHandleType handle + external_handle_t handle ) : IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { @@ -125,7 +125,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked* dedication = nullptr, const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, - ExternalHandleType externalHandle = {}, + external_handle_t externalHandle = {}, std::unique_ptr&& postDestroyCleanup = nullptr) { for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index 59886b32cb..0edc906b5d 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ -167,7 +167,7 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; - virtual ExternalHandleType getExternalHandle() const = 0; + virtual external_handle_t getExternalHandle() const = 0; const SCachedCreationParams& getCreationParams() const { return m_creationParams; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 738913e709..9e572fe119 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -121,14 +121,14 @@ CUresult CCUDADevice::createSharedMemory( if (auto err = reserveAdrressAndMapMemory(&params.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseHandle(params.externalHandle); + CloseExternalHandle(params.externalHandle);
cu.pcuMemRelease(mem); return err; } if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CloseHandle(params.externalHandle); + CloseExternalHandle(params.externalHandle); return err; } diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 2e58fa4756..22a5ea858a 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -97,7 +97,7 @@ CCUDASharedMemory::~CCUDASharedMemory() CUresult re[] = { cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), }; - CloseHandle(m_params.externalHandle); + CloseExternalHandle(m_params.externalHandle); } } diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 66a0198402..5b1e2ec981 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -80,7 +80,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; - ExternalHandleType externalHandle = ExternalHandleType{}; + external_handle_t externalHandle = external_handle_t{}; const auto handleType = static_cast(creationParams.externalHandleTypes.value); if (handleType != 0) { @@ -177,23 +177,6 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } -ExternalHandleType DupeHandle(uint64_t pid, ExternalHandleType handle) -{ -#ifdef _WIN32 - HANDLE re = 0; - - HANDLE cur = GetCurrentProcess(); - HANDLE src = pid ? 
OpenProcess(GENERIC_ALL, false, pid) : cur; - - if (!DuplicateHandle(src, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) - return 0; - - CloseHandle(src); - return re; -#endif - return handle; -} - IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) @@ -240,8 +223,8 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca { if (info.externalHandle) //importing { - auto duped = DupeHandle(0, info.externalHandle); - const_cast(info.externalHandle) = duped; + auto duped = DuplicateExternalHandle(info.externalHandle); + const_cast(info.externalHandle) = duped; *pNext = &importInfo; } else // exporting @@ -312,7 +295,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca #else vkGetMemoryFdKHR #endif - (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) { m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); return {}; diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index c817213700..f2194756f9 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -11,7 +11,7 @@ CVulkanMemoryAllocation::CVulkanMemoryAllocation( CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { - if (m_params.externalHandle) + if (m_params.externalHandle != ExternalHandleNull) { bool re = CloseHandle(getCreationParams().externalHandle); assert(re); diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 958849dae2..35aefa6ebd 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -12,13 +12,7 @@ CVulkanSemaphore::~CVulkanSemaphore() vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); if (m_creationParams.externalHandleTypes != EHT_NONE) { 
-#ifdef _WIN32 - if (!CloseHandle(m_externalHandle)) - assert(!"Invalid code path."); -#else - if (close(m_externalHandle) != 0) - assert(!"Invalid code path."); -#endif + CloseExternalHandle(m_externalHandle); } } diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 3fd4cb82dc..12ba147a24 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,7 +15,7 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const ExternalHandleType externalHandle) + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const external_handle_t externalHandle) : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); @@ -24,13 +24,13 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} - ExternalHandleType getExternalHandle() const override { return m_externalHandle; } + external_handle_t getExternalHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; - const ExternalHandleType m_externalHandle; + const external_handle_t m_externalHandle; }; } From fb66f3a83d8a8190258f1559f7baf9345335f3a7 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 17 Apr 2026 14:35:22 +0700 Subject: [PATCH 41/83] Small fix to use CloseExternalHandle --- src/nbl/video/CVulkanMemoryAllocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index f2194756f9..f2d64eceed 100644 --- 
a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -13,7 +13,7 @@ CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { if (m_params.externalHandle != ExternalHandleNull) { - bool re = CloseHandle(getCreationParams().externalHandle); + bool re = CloseExternalHandle(getCreationParams().externalHandle); assert(re); } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); From 47ba7e4b6b36448a49bc2d5f5962ce423e8a5026 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:20:15 +0700 Subject: [PATCH 42/83] Remove CCUDASharedMemory::exportAsImage --- include/nbl/video/CCUDASharedMemory.h | 2 -- src/nbl/video/CCUDASharedMemory.cpp | 16 ---------------- 2 files changed, 18 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 20902ac90b..2ce4a8067e 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -49,8 +49,6 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - core::smart_refctd_ptr exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const; - protected: CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index 22a5ea858a..a5f79a0c72 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -73,22 +73,6 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDev #endif -core::smart_refctd_ptr CCUDASharedMemory::exportAsImage(ILogicalDevice* device, asset::IImage::SCreationParams&& params) const -{ - if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) - return nullptr; - - // auto img = device->createImage({ - // 
std::move(params), {{ .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE }}, - // IGPUImage::ET_LINEAR, - // IGPUImage::EL_PREINITIALIZED, - // }); - // - // if (exportAsMemory(device, img.get())) - // return img; - - return nullptr; -} CCUDASharedMemory::~CCUDASharedMemory() { From d15d00c58564633352ab6d484e24fef50579c809 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:20:41 +0700 Subject: [PATCH 43/83] Remove unused CCUDASharedMemory::exportAsBuffer --- src/nbl/video/CCUDASharedMemory.cpp | 35 ----------------------------- 1 file changed, 35 deletions(-) diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDASharedMemory.cpp index a5f79a0c72..34560e5575 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDASharedMemory.cpp @@ -39,41 +39,6 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor std::make_unique(core::smart_refctd_ptr(this))).memory; } -#if 0 -core::smart_refctd_ptr CCUDASharedMemory::exportAsBuffer(ILogicalDevice* device, core::bitflag usage) const -{ - if (!device || !m_device->isMatchingDevice(device->getPhysicalDevice())) - return nullptr; - - auto buf = device->createBuffer({{ - .size = m_params.granularSize, - .usage = usage }, {{ - .postDestroyCleanup = std::make_unique(core::smart_refctd_ptr(this)), - .externalHandleTypes = CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - .externalHandle = m_params.osHandle - }}}); - - auto req = buf->getMemoryReqs(); - auto pd = device->getPhysicalDevice(); - switch (m_params.location) - { - case CU_MEM_LOCATION_TYPE_DEVICE: req.memoryTypeBits &= pd->getDeviceLocalMemoryTypeBits(); break; - case CU_MEM_LOCATION_TYPE_HOST: req.memoryTypeBits &= pd->getHostVisibleMemoryTypeBits(); break; - // TODO(Atil): Figure out how to handle these - case CU_MEM_LOCATION_TYPE_HOST_NUMA: - case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: - default: break; - } - - if (!device->allocate(req, buf.get()).isValid()) - return nullptr; - - return buf; -} - -#endif - - 
CCUDASharedMemory::~CCUDASharedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); From ea361894c249ef8b41cf153c8e5eb6de223bba73 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:37:52 +0700 Subject: [PATCH 44/83] Refactor external memory allocation to store the external handle separated from imported handle --- include/nbl/video/CCUDASharedMemory.h | 1 + include/nbl/video/IDeviceMemoryAllocation.h | 2 ++ src/nbl/video/CVulkanLogicalDevice.cpp | 10 +++++----- src/nbl/video/CVulkanMemoryAllocation.cpp | 7 ++++--- src/nbl/video/CVulkanMemoryAllocation.h | 7 +++++++ 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDASharedMemory.h index 2ce4a8067e..35965e5370 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDASharedMemory.h @@ -58,6 +58,7 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted ~CCUDASharedMemory() override; core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_allocation; SCachedCreationParams m_params; }; diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index cd15039203..52b541ceb5 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -187,6 +187,8 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted inline const SCreationParams& getCreationParams() const { return m_params; } + virtual external_handle_t getExternalHandle() const = 0; + protected: inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) { diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 5b1e2ec981..c22cfe93b9 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -194,7 +194,6 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkImportMemoryWin32HandleInfoKHR 
importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, .handleType = static_cast(info.externalHandleType), - .handle = info.externalHandle }; VkExportMemoryWin32HandleInfoKHR handleInfo = { @@ -219,12 +218,13 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca const void** pNext = &vk_allocateFlagsInfo.pNext; + external_handle_t externalHandle = ExternalHandleNull; if (info.externalHandleType) { if (info.externalHandle) //importing { - auto duped = DuplicateExternalHandle(info.externalHandle); - const_cast(info.externalHandle) = duped; + externalHandle = DuplicateExternalHandle(info.externalHandle); + importInfo.handle = externalHandle; *pNext = &importInfo; } else // exporting @@ -295,7 +295,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca #else vkGetMemoryFdKHR #endif - (m_vkdev, &handleInfo, const_cast(&info.externalHandle))) + (m_vkdev, &handleInfo, &externalHandle)) { m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); return {}; @@ -307,7 +307,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; IDeviceMemoryAllocator::SAllocation ret = {}; - ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, std::move(params)); + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, externalHandle, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index f2d64eceed..0ec6fc351d 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -6,14 +6,15 
@@ namespace nbl::video CVulkanMemoryAllocation::CVulkanMemoryAllocation( const CVulkanLogicalDevice* dev, const VkDeviceMemory deviceMemoryHandle, + const external_handle_t externalHandle, SCreationParams&& params -) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} +) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle), m_externalHandle(externalHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { - if (m_params.externalHandle != ExternalHandleNull) + if (m_externalHandle != ExternalHandleNull) { - bool re = CloseExternalHandle(getCreationParams().externalHandle); + bool re = CloseExternalHandle(m_externalHandle); assert(re); } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 22e32142c0..473d826595 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -17,11 +17,17 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation CVulkanMemoryAllocation( const CVulkanLogicalDevice* dev, const VkDeviceMemory deviceMemoryHandle, + const external_handle_t externalHandle, SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } + inline external_handle_t getExternalHandle() const override + { + return m_externalHandle; + } + private: ~CVulkanMemoryAllocation(); @@ -30,6 +36,7 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation core::smart_refctd_ptr m_vulkanDevice; const VkDeviceMemory m_deviceMemoryHandle; + const external_handle_t m_externalHandle; }; } From f04dcdb03b1d74b1354340d3d499050ea30de2d1 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:38:13 +0700 Subject: [PATCH 45/83] Remove unused constructor parameter in 
CCUDASharedSemaphore --- include/nbl/video/CCUDASharedSemaphore.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDASharedSemaphore.h index 60daec7159..2277ea57cf 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDASharedSemaphore.h @@ -30,8 +30,7 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted CCUDASharedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalSemaphore semaphore, - external_handle_t osHandle) + CUexternalSemaphore semaphore) : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) From cea9d9e81f1ba9d5d2812bf68d258f27c20dccba Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 17:38:56 +0700 Subject: [PATCH 46/83] Implement CCUDAImportedMemory --- include/nbl/video/CCUDADevice.h | 3 ++ include/nbl/video/CCUDAImportedMemory.h | 42 +++++++++++++++++++++++++ src/nbl/CMakeLists.txt | 1 + src/nbl/video/CCUDADevice.cpp | 34 ++++++++++++++++++-- src/nbl/video/CCUDAImportedMemory.cpp | 33 +++++++++++++++++++ 5 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 include/nbl/video/CCUDAImportedMemory.h create mode 100644 src/nbl/video/CCUDAImportedMemory.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 6b3ab2bbb6..a80bbbbd28 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -7,6 +7,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDASharedSemaphore.h" @@ -91,6 +92,8 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(CUmemLocationType location, size_t size) const; CUresult createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, 
IDeviceMemoryAllocation* mem); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); protected: diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h new file mode 100644 index 0000000000..8fbbccb31b --- /dev/null +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -0,0 +1,42 @@ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#define _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." +#endif + +#endif // _NBL_COMPILE_WITH_CUDA + +namespace nbl::video +{ + +class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +{ + public: + friend class CCUDADevice; + + CUexternalMemory getInternalObject() const { return m_handle; } + CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); + + protected: + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, + CUexternalMemory cuExtMem) : + m_device(device), + m_src(src), + m_handle(cuExtMem) {} + + ~CCUDAImportedMemory() override; + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalMemory m_handle; + +}; + +} + +#endif \ No newline at end of file diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index bbec1b1691..eedfd514c6 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -296,6 +296,7 @@ set(NBL_VIDEO_SOURCES video/CCUDADevice.cpp video/CCUDASharedSemaphore.cpp video/CCUDASharedMemory.cpp + video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 9e572fe119..535fb76d46 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -5,6 +5,8 @@ #include +#include "nbl/video/CCUDAImportedMemory.h" + #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { @@ -137,6 +139,35 @@ CUresult CCUDADevice::createSharedMemory( return CUDA_SUCCESS; } +CUresult 
CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) +{ + if (!mem || !outPtr) + return CUDA_ERROR_INVALID_VALUE; + + auto& cu = m_handler->getCUDAFunctionTable(); + auto handleType = mem->getCreationParams().externalHandleType; + + if (!handleType) return CUDA_ERROR_INVALID_VALUE; + + const auto externalHandle = mem->getExternalHandle(); + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; +#ifdef _WIN32 + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; + extMemDesc.handle.win32.handle = externalHandle; +#else + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; + extMemDesc.handle.fd = externalHandle; +#endif + extMemDesc.size = mem->getAllocationSize(); + + CUexternalMemory cuExtMem; + if (auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + return err; + *outPtr = core::smart_refctd_ptr(new CCUDAImportedMemory(core::smart_refctd_ptr(this), core::smart_refctd_ptr(mem), cuExtMem), core::dont_grab); + return CUDA_SUCCESS; +} + CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) @@ -164,8 +195,7 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema, {}), core::dont_grab); + *outPtr = core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp new file mode 100644 index 0000000000..33ba43eb28 --- /dev/null +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +namespace nbl::video +{ + +CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +{ + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); + +} + +CCUDAImportedMemory::~CCUDAImportedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + if (cu.pcuDestroyExternalMemory(m_handle) != CUDA_SUCCESS) + assert(!"Invalid code path"); +} + +} + +#endif \ No newline at end of file From 3ea3e9d5ae485b58069f3284ed646a28d7ab071c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 18:07:38 +0700 Subject: [PATCH 47/83] Rename CCUDASharedSemaphore into CCUDAImportedSemaphore --- include/nbl/video/CCUDADevice.h | 4 ++-- ...edSemaphore.h => CCUDAImportedSemaphore.h} | 10 +++++----- src/nbl/CMakeLists.txt | 2 +- src/nbl/video/CCUDADevice.cpp | 4 ++-- src/nbl/video/CCUDAImportedSemaphore.cpp | 19 +++++++++++++++++++ 5 files changed, 29 insertions(+), 10 deletions(-) rename include/nbl/video/{CCUDASharedSemaphore.h => CCUDAImportedSemaphore.h} (76%) create mode 100644 src/nbl/video/CCUDAImportedSemaphore.cpp diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index a80bbbbd28..c7778af0be 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -8,7 +8,7 @@ #include "nbl/video/IPhysicalDevice.h" #include "nbl/video/CCUDASharedMemory.h" #include "nbl/video/CCUDAImportedMemory.h" -#include "nbl/video/CCUDASharedSemaphore.h" +#include "nbl/video/CCUDAImportedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -94,7 +94,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted 
CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); diff --git a/include/nbl/video/CCUDASharedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h similarity index 76% rename from include/nbl/video/CCUDASharedSemaphore.h rename to include/nbl/video/CCUDAImportedSemaphore.h index 2277ea57cf..d5139a55c9 100644 --- a/include/nbl/video/CCUDASharedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -1,8 +1,8 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ -#define _NBL_VIDEO_C_CUDA_SHARED_SEMAPHORE_H_ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -19,7 +19,7 @@ namespace nbl::video { -class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted +class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { public: friend class CCUDADevice; @@ -28,14 +28,14 @@ class NBL_API2 CCUDASharedSemaphore : public core::IReferenceCounted protected: - CCUDASharedSemaphore(core::smart_refctd_ptr device, + CCUDAImportedSemaphore(core::smart_refctd_ptr device, core::smart_refctd_ptr src, CUexternalSemaphore semaphore) : m_device(std::move(device)) , m_src(std::move(m_src)) , m_handle(semaphore) {} - ~CCUDASharedSemaphore() override; + ~CCUDAImportedSemaphore() override; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_src; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index eedfd514c6..52605112e5 100644 
--- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,7 +294,7 @@ set(NBL_VIDEO_SOURCES # CUDA video/CCUDAHandler.cpp video/CCUDADevice.cpp - video/CCUDASharedSemaphore.cpp + video/CCUDAImportedSemaphore.cpp video/CCUDASharedMemory.cpp video/CCUDAImportedMemory.cpp ) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 535fb76d46..30d4093fb1 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -168,7 +168,7 @@ CUresult CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; @@ -195,7 +195,7 @@ CUresult CCUDADevice::importGPUSemaphore(core::smart_refctd_ptr(new CCUDASharedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); + *outPtr = core::smart_refctd_ptr(new CCUDAImportedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp new file mode 100644 index 0000000000..69b851088e --- /dev/null +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -0,0 +1,19 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAImportedSemaphore.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) + assert(!"Invalid code path."); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file From 130cd1ef1124110f3a0a1f4b07e8d672c1a2e9e5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 18:11:13 +0700 Subject: [PATCH 48/83] Rename CCUDASharedMemory into CCUDAExportableMemory --- include/nbl/video/CCUDADevice.h | 4 ++-- .../{CCUDASharedMemory.h => CCUDAExportableMemory.h} | 10 +++++----- src/nbl/CMakeLists.txt | 2 +- src/nbl/video/CCUDADevice.cpp | 10 +++++----- ...CCUDASharedMemory.cpp => CCUDAExportableMemory.cpp} | 8 ++++---- 5 files changed, 17 insertions(+), 17 deletions(-) rename include/nbl/video/{CCUDASharedMemory.h => CCUDAExportableMemory.h} (82%) rename src/nbl/video/{CCUDASharedMemory.cpp => CCUDAExportableMemory.cpp} (83%) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index c7778af0be..869e84d691 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -6,7 +6,7 @@ #include "nbl/video/IPhysicalDevice.h" -#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDAImportedMemory.h" #include "nbl/video/CCUDAImportedSemaphore.h" @@ -90,7 +90,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const CCUDAHandler* getHandler() const { return m_handler.get(); } bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } size_t roundToGranularity(CUmemLocationType location, size_t size) const; - CUresult 
createSharedMemory(core::smart_refctd_ptr* outMem, struct CCUDASharedMemory::SCreationParams&& inParams); + CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); diff --git a/include/nbl/video/CCUDASharedMemory.h b/include/nbl/video/CCUDAExportableMemory.h similarity index 82% rename from include/nbl/video/CCUDASharedMemory.h rename to include/nbl/video/CCUDAExportableMemory.h index 35965e5370..8729c87338 100644 --- a/include/nbl/video/CCUDASharedMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -1,8 +1,8 @@ // Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ -#define _NBL_VIDEO_C_CUDA_SHARED_MEMORY_H_ +#ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -24,7 +24,7 @@ class CCUDAMemoryMapping: public core::IReferenceCounted { }; -class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted +class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { public: friend class CCUDADevice; @@ -51,11 +51,11 @@ class NBL_API2 CCUDASharedMemory : public core::IReferenceCounted protected: - CCUDASharedMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) : m_device(std::move(device)) , m_params(std::move(params)) {} - ~CCUDASharedMemory() override; + ~CCUDAExportableMemory() override; core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_allocation; diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 52605112e5..692efec8bd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -295,7 +295,7 @@ 
set(NBL_VIDEO_SOURCES video/CCUDAHandler.cpp video/CCUDADevice.cpp video/CCUDAImportedSemaphore.cpp - video/CCUDASharedMemory.cpp + video/CCUDAExportableMemory.cpp video/CCUDAImportedMemory.cpp ) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 30d4093fb1..bd54cce81e 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -88,14 +88,14 @@ CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -CUresult CCUDADevice::createSharedMemory( - core::smart_refctd_ptr* outMem, - CCUDASharedMemory::SCreationParams&& inParams) +CUresult CCUDADevice::createExportableMemory( + core::smart_refctd_ptr* outMem, + CCUDAExportableMemory::SCreationParams&& inParams) { if (!outMem) return CUDA_ERROR_INVALID_VALUE; - CCUDASharedMemory::SCachedCreationParams params = { inParams }; + CCUDAExportableMemory::SCachedCreationParams params = { inParams }; auto& cu = m_handler->getCUDAFunctionTable(); @@ -134,7 +134,7 @@ CUresult CCUDADevice::createSharedMemory( return err; } - *outMem = core::smart_refctd_ptr(new CCUDASharedMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + *outMem = core::smart_refctd_ptr(new CCUDAExportableMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); return CUDA_SUCCESS; } diff --git a/src/nbl/video/CCUDASharedMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp similarity index 83% rename from src/nbl/video/CCUDASharedMemory.cpp rename to src/nbl/video/CCUDAExportableMemory.cpp index 34560e5575..bbe773f610 100644 --- a/src/nbl/video/CCUDASharedMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -2,14 +2,14 @@ // This file is part of the "Nabla Engine". 
// For conditions of distribution and use, see copyright notice in nabla.h -#include "nbl/video/CCUDASharedMemory.h" +#include "nbl/video/CCUDAExportableMemory.h" #include "nbl/video/CCUDADevice.h" #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -core::smart_refctd_ptr CCUDASharedMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const { auto pd = device->getPhysicalDevice(); uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; @@ -36,10 +36,10 @@ core::smart_refctd_ptr CCUDASharedMemory::exportAsMemor IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, m_params.externalHandle, - std::make_unique(core::smart_refctd_ptr(this))).memory; + std::make_unique(core::smart_refctd_ptr(this))).memory; } -CCUDASharedMemory::~CCUDASharedMemory() +CCUDAExportableMemory::~CCUDAExportableMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); From c624053e03598cfde6dac16660124506a17b6cb5 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Wed, 22 Apr 2026 18:54:18 +0700 Subject: [PATCH 49/83] Remove unused member in CCUDAExportableMemory --- include/nbl/video/CCUDAExportableMemory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index 8729c87338..b4df99d9f5 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -58,7 +58,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted ~CCUDAExportableMemory() override; core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_allocation; SCachedCreationParams m_params; }; From 9127faa8215ad6fe9de8dabd7563d7d9263d7b6f Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:08:29 +0700 Subject: [PATCH 50/83] Slight rename to CCUDADevice 
method --- include/nbl/video/CCUDADevice.h | 4 ++-- src/nbl/video/CCUDADevice.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 869e84d691..89449a21f0 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -92,9 +92,9 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(CUmemLocationType location, size_t size) const; CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); - CUresult importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); + CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); - CUresult importGPUSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index bd54cce81e..3f933be988 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -139,11 +139,13 @@ CUresult CCUDADevice::createExportableMemory( return CUDA_SUCCESS; } -CUresult CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) +CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) { if (!mem || !outPtr) return CUDA_ERROR_INVALID_VALUE; + const auto memProperty = mem->getCreationParams().memoryPropertyFlags; + auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = mem->getCreationParams().externalHandleType; @@ -168,7 +170,7 @@ CUresult CCUDADevice::importGPUMemory(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +CUresult 
CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; From 059d1d5fa195abd08280bd032bfb3cc7d574a08d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:18:29 +0700 Subject: [PATCH 51/83] Merge with master --- include/nbl/builtin/hlsl/tgmath/impl.hlsl | 63 ++++++++++++----------- include/nbl/system/to_string.h | 10 ++++ include/nbl/video/CCUDAHandler.h | 57 ++++++-------------- 3 files changed, 59 insertions(+), 71 deletions(-) diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl index 4d1a30c757..0c1dc2f458 100644 --- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl +++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl @@ -197,12 +197,12 @@ struct erf_helper(NBL_FP64_LITERAL(0.254829592)); + const FloatingPoint a2 = _static_cast(NBL_FP64_LITERAL(-0.284496736)); + const FloatingPoint a3 = _static_cast(NBL_FP64_LITERAL(1.421413741)); + const FloatingPoint a4 = _static_cast(NBL_FP64_LITERAL(-1.453152027)); + const FloatingPoint a5 = _static_cast(NBL_FP64_LITERAL(1.061405429)); + const FloatingPoint p = _static_cast(NBL_FP64_LITERAL(0.3275911)); FloatingPoint _sign = FloatingPoint(sign(_x)); FloatingPoint x = abs(_x); @@ -393,10 +393,10 @@ struct erf_helper static float16_t __call(float16_t _x) { // A&S approximation to 2.5x10-5 - const float16_t a1 = float16_t(0.3480242f); - const float16_t a2 = float16_t(-0.0958798f); - const float16_t a3 = float16_t(0.7478556f); - const float16_t p = float16_t(0.47047f); + const float16_t a1 = _static_cast(0.3480242f); + const float16_t a2 = _static_cast(-0.0958798f); + const float16_t a3 = _static_cast(0.7478556f); + const float16_t p = _static_cast(0.47047f); float16_t _sign = float16_t(sign(_x)); float16_t x = abs_helper::__call(_x); @@ -414,35 +414,36 @@ struct erfInv_helper(_x, FloatingPoint(NBL_FP64_LITERAL(-0.99999)), FloatingPoint(NBL_FP64_LITERAL(0.99999))); + // TODO: maybe need to 
replace `FloatingPoint(NBL_FP64_LITERAL` with `_static_cast(NBL_FP64_LITERAL` to make DXC shut up + FloatingPoint x = clamp(_x, _static_cast(NBL_FP64_LITERAL(-0.99999)), _static_cast(NBL_FP64_LITERAL(0.99999))); - FloatingPoint w = -log_helper::__call((FloatingPoint(NBL_FP64_LITERAL(1.0)) - x) * (FloatingPoint(NBL_FP64_LITERAL(1.0)) + x)); + FloatingPoint w = -log_helper::__call((_static_cast(NBL_FP64_LITERAL(1.0)) - x) * (_static_cast(NBL_FP64_LITERAL(1.0)) + x)); FloatingPoint p; if (w < 5.0) { - w -= FloatingPoint(NBL_FP64_LITERAL(2.5)); - p = FloatingPoint(NBL_FP64_LITERAL(2.81022636e-08)); - p = FloatingPoint(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00021858087)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00125372503)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00417768164)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.246640727)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.50140941)) + p * w; + w -= _static_cast(NBL_FP64_LITERAL(2.5)); + p = _static_cast(NBL_FP64_LITERAL(2.81022636e-08)); + p = _static_cast(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00021858087)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00125372503)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00417768164)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.246640727)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.50140941)) + p * w; } else { w = sqrt_helper::__call(w) - FloatingPoint(NBL_FP64_LITERAL(3.0)); - p = FloatingPoint(NBL_FP64_LITERAL(-0.000200214257)); - p = FloatingPoint(NBL_FP64_LITERAL(0.000100950558)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00134934322)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00367342844)) + 
p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00573950773)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.0076224613)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00943887047)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.00167406)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(2.83297682)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.000200214257)); + p = _static_cast(NBL_FP64_LITERAL(0.000100950558)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00134934322)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00367342844)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00573950773)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.0076224613)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00943887047)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.00167406)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(2.83297682)) + p * w; } return p * x; } diff --git a/include/nbl/system/to_string.h b/include/nbl/system/to_string.h index 2a06ace5e5..1f8988566e 100644 --- a/include/nbl/system/to_string.h +++ b/include/nbl/system/to_string.h @@ -1,6 +1,7 @@ #ifndef _NBL_SYSTEM_TO_STRING_INCLUDED_ #define _NBL_SYSTEM_TO_STRING_INCLUDED_ +#include #include #include #include @@ -21,6 +22,15 @@ struct to_string_helper } }; +template +struct to_string_helper +{ + static std::string __call(const T& value) + { + return std::format("{}", value); + } +}; + template<> struct to_string_helper { diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9de55914b5..01774b25d2 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,9 +16,9 @@ namespace nbl::video { -class NBL_API2 CCUDAHandler : public core::IReferenceCounted +class CCUDAHandler : public core::IReferenceCounted { - public: + public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); inline bool defaultHandleResult(CUresult result) { @@ -34,12 +34,12 @@ class NBL_API2 CCUDAHandler : public 
core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v4 + ,cuCtxCreate_v2 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +62,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid_v2 + ,cuDeviceGetUuid ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion @@ -119,24 +119,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy - ,cuImportExternalMemory - ,cuDestroyExternalMemory - ,cuExternalMemoryGetMappedBuffer - ,cuMemUnmap - ,cuMemAddressFree - ,cuMemGetAllocationGranularity - ,cuMemAddressReserve - ,cuMemCreate - ,cuMemExportToShareableHandle - ,cuMemMap - ,cuMemRelease - ,cuMemSetAccess - ,cuMemImportFromShareableHandle - ,cuLaunchHostFunc - ,cuDestroyExternalSemaphore - ,cuImportExternalSemaphore - ,cuSignalExternalSemaphoresAsync - ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -175,25 +157,13 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::IFile::success_t bytesRead; + system::future bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.getBytesProcessed()); + source.resize(bytesRead.get()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } - struct SCUDADeviceInfo - { - CUdevice handle = {}; - CUuuid uuid = {}; - int 
attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - }; - - inline core::vector const& getAvailableDevices() const - { - return m_availableDevices; - } - // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -258,8 +228,16 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); - + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) + : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) + { + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + } ~CCUDAHandler() = default; // @@ -282,7 +260,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // - core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; From 2eb8fee018a1877cca265efaf929ea78bbeee440 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 01:37:55 +0700 Subject: [PATCH 52/83] Add option for _NBL_COMPILE_WITH_CUDA_ --- src/nbl/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 692efec8bd..4c2f0571dd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -68,6 +68,7 @@ option(_NBL_COMPILE_WITH_GLI_LOADER_ "Compile with GLI Loader" ON) option(_NBL_COMPILE_WITH_GLI_WRITER_ "Compile with GLI Writer" ON) option(_NBL_COMPILE_WITH_GLTF_LOADER_ "Compile with GLTF Loader" OFF) # TMP OFF COMPILE ERRORS ON 
V143 ON MASTER option(_NBL_COMPILE_WITH_GLTF_WRITER_ "Compile with GLTF Writer" OFF) # TMP OFF COMPILE ERRORS ON V143 ON MASTER +option(_NBL_COMPILE_WITH_CUDA_ "Compile with CUDA" ON) set(_NBL_EG_PRFNT_LEVEL 0 CACHE STRING "EasterEgg Profanity Level") option(NBL_EXPLICIT_MODULE_LOAD_LOG "Enable Runtime logs for external dynamic module loading" OFF) @@ -95,9 +96,8 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (NBL_COMPILE_WITH_CUDA) +if (_NBL_COMPILE_WITH_CUDA_) message(STATUS "Building with CUDA interop") - set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() @@ -425,7 +425,7 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(NBL_COMPILE_WITH_CUDA) +if(_NBL_COMPILE_WITH_CUDA_) target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) endif() @@ -665,7 +665,7 @@ target_link_libraries(Nabla PRIVATE volk) target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) # CUDA -if (NBL_COMPILE_WITH_CUDA) +if (_NBL_COMPILE_WITH_CUDA_) list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") endif() From 6605bebf70742dbd64530b8cfeb93e911c5850fc Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 13:42:49 +0700 Subject: [PATCH 53/83] Revert to correct state before merging with master --- include/nbl/builtin/hlsl/tgmath/impl.hlsl | 63 +++++++++++------------ include/nbl/video/CCUDAHandler.h | 57 ++++++++++++++------ 2 files changed, 71 insertions(+), 49 deletions(-) diff --git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl index 0c1dc2f458..4d1a30c757 100644 --- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl +++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl 
@@ -197,12 +197,12 @@ struct erf_helper(NBL_FP64_LITERAL(0.254829592)); - const FloatingPoint a2 = _static_cast(NBL_FP64_LITERAL(-0.284496736)); - const FloatingPoint a3 = _static_cast(NBL_FP64_LITERAL(1.421413741)); - const FloatingPoint a4 = _static_cast(NBL_FP64_LITERAL(-1.453152027)); - const FloatingPoint a5 = _static_cast(NBL_FP64_LITERAL(1.061405429)); - const FloatingPoint p = _static_cast(NBL_FP64_LITERAL(0.3275911)); + const FloatingPoint a1 = FloatingPoint(NBL_FP64_LITERAL(0.254829592)); + const FloatingPoint a2 = FloatingPoint(NBL_FP64_LITERAL(-0.284496736)); + const FloatingPoint a3 = FloatingPoint(NBL_FP64_LITERAL(1.421413741)); + const FloatingPoint a4 = FloatingPoint(NBL_FP64_LITERAL(-1.453152027)); + const FloatingPoint a5 = FloatingPoint(NBL_FP64_LITERAL(1.061405429)); + const FloatingPoint p = FloatingPoint(NBL_FP64_LITERAL(0.3275911)); FloatingPoint _sign = FloatingPoint(sign(_x)); FloatingPoint x = abs(_x); @@ -393,10 +393,10 @@ struct erf_helper static float16_t __call(float16_t _x) { // A&S approximation to 2.5x10-5 - const float16_t a1 = _static_cast(0.3480242f); - const float16_t a2 = _static_cast(-0.0958798f); - const float16_t a3 = _static_cast(0.7478556f); - const float16_t p = _static_cast(0.47047f); + const float16_t a1 = float16_t(0.3480242f); + const float16_t a2 = float16_t(-0.0958798f); + const float16_t a3 = float16_t(0.7478556f); + const float16_t p = float16_t(0.47047f); float16_t _sign = float16_t(sign(_x)); float16_t x = abs_helper::__call(_x); @@ -414,36 +414,35 @@ struct erfInv_helper(NBL_FP64_LITERAL` to make DXC shut up - FloatingPoint x = clamp(_x, _static_cast(NBL_FP64_LITERAL(-0.99999)), _static_cast(NBL_FP64_LITERAL(0.99999))); + FloatingPoint x = clamp(_x, FloatingPoint(NBL_FP64_LITERAL(-0.99999)), FloatingPoint(NBL_FP64_LITERAL(0.99999))); - FloatingPoint w = -log_helper::__call((_static_cast(NBL_FP64_LITERAL(1.0)) - x) * (_static_cast(NBL_FP64_LITERAL(1.0)) + x)); + FloatingPoint w = 
-log_helper::__call((FloatingPoint(NBL_FP64_LITERAL(1.0)) - x) * (FloatingPoint(NBL_FP64_LITERAL(1.0)) + x)); FloatingPoint p; if (w < 5.0) { - w -= _static_cast(NBL_FP64_LITERAL(2.5)); - p = _static_cast(NBL_FP64_LITERAL(2.81022636e-08)); - p = _static_cast(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00021858087)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.00125372503)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.00417768164)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.246640727)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(1.50140941)) + p * w; + w -= FloatingPoint(NBL_FP64_LITERAL(2.5)); + p = FloatingPoint(NBL_FP64_LITERAL(2.81022636e-08)); + p = FloatingPoint(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00021858087)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.00125372503)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.00417768164)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.246640727)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(1.50140941)) + p * w; } else { w = sqrt_helper::__call(w) - FloatingPoint(NBL_FP64_LITERAL(3.0)); - p = _static_cast(NBL_FP64_LITERAL(-0.000200214257)); - p = _static_cast(NBL_FP64_LITERAL(0.000100950558)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00134934322)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.00367342844)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00573950773)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(-0.0076224613)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(0.00943887047)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(1.00167406)) + p * w; - p = _static_cast(NBL_FP64_LITERAL(2.83297682)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.000200214257)); + 
p = FloatingPoint(NBL_FP64_LITERAL(0.000100950558)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00134934322)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.00367342844)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00573950773)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(-0.0076224613)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(0.00943887047)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(1.00167406)) + p * w; + p = FloatingPoint(NBL_FP64_LITERAL(2.83297682)) + p * w; } return p * x; } diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..9de55914b5 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,9 +16,9 @@ namespace nbl::video { -class CCUDAHandler : public core::IReferenceCounted +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { - public: + public: static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); inline bool defaultHandleResult(CUresult result) { @@ -34,12 +34,12 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v2 + ,cuCtxCreate_v4 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +62,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid + ,cuDeviceGetUuid_v2 ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion @@ -119,6 +119,24 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory + 
,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -157,13 +175,25 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -228,16 +258,8 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + 
CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + ~CCUDAHandler() = default; // @@ -260,6 +282,7 @@ class CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // + core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; From af35f4f24df3ce4a670c9addebccb3429fa2ff8c Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 13:43:59 +0700 Subject: [PATCH 54/83] Revert "Add option for _NBL_COMPILE_WITH_CUDA_" This reverts commit 2eb8fee018a1877cca265efaf929ea78bbeee440. --- src/nbl/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 4c2f0571dd..692efec8bd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -68,7 +68,6 @@ option(_NBL_COMPILE_WITH_GLI_LOADER_ "Compile with GLI Loader" ON) option(_NBL_COMPILE_WITH_GLI_WRITER_ "Compile with GLI Writer" ON) option(_NBL_COMPILE_WITH_GLTF_LOADER_ "Compile with GLTF Loader" OFF) # TMP OFF COMPILE ERRORS ON V143 ON MASTER option(_NBL_COMPILE_WITH_GLTF_WRITER_ "Compile with GLTF Writer" OFF) # TMP OFF COMPILE ERRORS ON V143 ON MASTER -option(_NBL_COMPILE_WITH_CUDA_ "Compile with CUDA" ON) set(_NBL_EG_PRFNT_LEVEL 0 CACHE STRING "EasterEgg Profanity Level") option(NBL_EXPLICIT_MODULE_LOAD_LOG "Enable Runtime logs for external dynamic module loading" OFF) @@ -96,8 +95,9 @@ configure_file("${NBL_ROOT_PATH}/include/nbl/config/BuildConfigOptions.h.in" "${ file(GENERATE OUTPUT "${CONFIG_OUTPUT}" INPUT "${CONFIG_DIRECOTORY}/.int/BuildConfigOptions.h.conf") nbl_install_file_spec("${CONFIG_OUTPUT}" nbl/config) -if (_NBL_COMPILE_WITH_CUDA_) +if (NBL_COMPILE_WITH_CUDA) message(STATUS "Building with CUDA interop") + set(_NBL_COMPILE_WITH_CUDA_ ${NBL_COMPILE_WITH_CUDA}) if (NBL_BUILD_OPTIX) set(_NBL_BUILD_OPTIX_ ${NBL_BUILD_OPTIX}) endif() @@ -425,7 +425,7 @@ 
if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() -if(_NBL_COMPILE_WITH_CUDA_) +if(NBL_COMPILE_WITH_CUDA) target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) endif() @@ -665,7 +665,7 @@ target_link_libraries(Nabla PRIVATE volk) target_compile_definitions(Nabla PUBLIC $<$:VK_USE_PLATFORM_WIN32_KHR>) # CUDA -if (_NBL_COMPILE_WITH_CUDA_) +if (NBL_COMPILE_WITH_CUDA) list(APPEND PUBLIC_BUILD_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}") endif() From 2479fb2e64bc9704779e10d496d0a71ef7bb7846 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 17:55:44 +0700 Subject: [PATCH 55/83] Slight fix --- src/nbl/video/CVulkanDeviceMemoryBacked.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 39c0efae19..955885b7ae 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked Date: Thu, 23 Apr 2026 17:59:08 +0700 Subject: [PATCH 56/83] Slight fix on linux handle --- src/nbl/video/CVulkanLogicalDevice.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index c22cfe93b9..74e8be47bf 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -204,7 +204,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca VkImportMemoryFdInfoKHR importInfo = { .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, .handleType = static_cast(info.externalHandleType), - .fd = (int)info.externalHandle, + .fd = info.externalHandle, }; #endif @@ -224,7 +224,11 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca if (info.externalHandle) //importing { 
externalHandle = DuplicateExternalHandle(info.externalHandle); +#ifdef _WIN32 importInfo.handle = externalHandle; +#else + importInfo.fd = externalHandle; +#endif *pNext = &importInfo; } else // exporting From 3df125b804a5147cdf78229d2ded029a64825d5d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:00:59 +0700 Subject: [PATCH 57/83] Fix typo --- include/nbl/asset/IBuffer.h | 2 +- include/nbl/video/IPhysicalDevice.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index 92ffd3eb4d..99f85e0b72 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -43,7 +43,7 @@ class IBuffer : public IDescriptor, public core::IBuffer // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, - EUF_SYNTHEHIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ + EUF_SYNTHETIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if anymore synthethic flags are added*/ }; //! diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index f8550debce..e3cfe15a90 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -661,7 +661,7 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const { - usages &= ~asset::IBuffer::EUF_SYNTHEHIC_FLAGS_MASK; // mask out synthetic flags + usages &= ~asset::IBuffer::EUF_SYNTHETIC_FLAGS_MASK; // mask out synthetic flags // TODO(kevinyu): Should we cached the properties like Atil does. If yes, needs mutex and mutable specifier. Class become not that simple anymore. 
// { From 2e2ca3f2a148aae96cb23aa1fc1bbe789753e5f6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:09:05 +0700 Subject: [PATCH 58/83] Fix CCUDAImportedSemaphore constructor --- include/nbl/video/CCUDAImportedSemaphore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index d5139a55c9..4d014b9e39 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -32,7 +32,7 @@ class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted core::smart_refctd_ptr src, CUexternalSemaphore semaphore) : m_device(std::move(device)) - , m_src(std::move(m_src)) + , m_src(std::move(src)) , m_handle(semaphore) {} ~CCUDAImportedSemaphore() override; From 8c4c91e273a9146ae6c4c3e9843ef0d37c4b4b75 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:11:56 +0700 Subject: [PATCH 59/83] Remove unused CCUDASharedSemaphore.cpp --- src/nbl/video/CCUDASharedSemaphore.cpp | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 src/nbl/video/CCUDASharedSemaphore.cpp diff --git a/src/nbl/video/CCUDASharedSemaphore.cpp b/src/nbl/video/CCUDASharedSemaphore.cpp deleted file mode 100644 index ae2291035a..0000000000 --- a/src/nbl/video/CCUDASharedSemaphore.cpp +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. -// This file is part of the "Nabla Engine". 
-// For conditions of distribution and use, see copyright notice in nabla.h - -#include "nbl/video/CCUDASharedSemaphore.h" -#include "nbl/video/CCUDADevice.h" - -#ifdef _NBL_COMPILE_WITH_CUDA_ -namespace nbl::video -{ -CCUDASharedSemaphore::~CCUDASharedSemaphore() -{ - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) - assert(!"Invalid code path."); -} -} - -#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file From fcec2684a22576b159e422a8fd7f2847e2a530b6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 18:20:11 +0700 Subject: [PATCH 60/83] Fix handle type for Linux --- src/nbl/video/CVulkanLogicalDevice.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CVulkanLogicalDevice.cpp b/src/nbl/video/CVulkanLogicalDevice.cpp index 74e8be47bf..5cc0dbd8f3 100644 --- a/src/nbl/video/CVulkanLogicalDevice.cpp +++ b/src/nbl/video/CVulkanLogicalDevice.cpp @@ -279,7 +279,7 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca #ifdef _WIN32 VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, #else - VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, #endif .memory = vk_deviceMemory, .handleType = static_cast(info.externalHandleType), From ac1878160267085b3d6b1d310999d99322ba0c91 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 19:05:06 +0700 Subject: [PATCH 61/83] Add missing external handle type and make the constant consistent --- include/nbl/video/CCUDADevice.h | 2 +- include/nbl/video/IDeviceMemoryAllocation.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 89449a21f0..d30e7b18c5 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -34,7 +34,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static constexpr 
IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else - static constexpr IDeviceMemoryBacked::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryBacked::EHT_OPAQUE_FD; + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 52b541ceb5..e75acf2fd0 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -74,6 +74,7 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted enum E_EXTERNAL_HANDLE_TYPE : uint32_t { EHT_NONE = 0, + EHT_OPAQUE_FD = 0x00000001, EHT_OPAQUE_WIN32 = 0x00000002, EHT_OPAQUE_WIN32_KMT = 0x00000004, EHT_D3D11_TEXTURE = 0x00000008, From d73c851440cc2aabed9b6461bf51e22d1795eff2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 20:06:19 +0700 Subject: [PATCH 62/83] Slight fix --- include/nbl/video/CCUDADevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index d30e7b18c5..7f51443972 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -35,7 +35,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; #else static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; - static constexpr CUmemAllocationHandleType ALLOCATION_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + static constexpr 
CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif struct SCUDACleaner : video::ICleanup From 2c75ed882e1d1a0da1feadb8336a2fdd4f76909d Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:32:43 +0700 Subject: [PATCH 63/83] Fix indentation and refactor to be more idiomatic --- include/nbl/video/CCUDADevice.h | 4 +- include/nbl/video/CCUDAExportableMemory.h | 55 ++++++++++++---------- include/nbl/video/CCUDAImportedMemory.h | 28 +++++------ include/nbl/video/CCUDAImportedSemaphore.h | 37 +++++++-------- src/nbl/video/CCUDADevice.cpp | 10 ++-- 5 files changed, 67 insertions(+), 67 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 7f51443972..ce9d0ea3b2 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -92,9 +92,9 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted size_t roundToGranularity(CUmemLocationType location, size_t size) const; CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); - CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem); + CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem); - CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sem); + CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); protected: CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index b4df99d9f5..d96a5ad62b 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -10,7 +10,7 @@ #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK 
or higher." + #error "Need CUDA 9.0 SDK or higher." #endif // useful includes in the future @@ -24,41 +24,44 @@ class CCUDAMemoryMapping: public core::IReferenceCounted { }; +class CCUDADevice; + class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted { -public: - friend class CCUDADevice; + public: - CUdeviceptr getDeviceptr() const { return m_params.ptr; } + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; - struct SCreationParams - { - size_t size; - uint32_t alignment; - CUmemLocationType location; - }; + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + external_handle_t externalHandle; + }; - struct SCachedCreationParams : SCreationParams - { - size_t granularSize; - CUdeviceptr ptr; - external_handle_t externalHandle; - }; + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, CUmemGenericAllocationHandle allocationHandle) + : m_device(std::move(device)) + , m_params(std::move(params)) + , m_allocationHandle(allocationHandle) + {} + ~CCUDAExportableMemory() override; - const SCreationParams& getCreationParams() const { return m_params; } + CUdeviceptr getDeviceptr() const { return m_params.ptr; } - core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + const SCreationParams& getCreationParams() const { return m_params; } -protected: + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) - : m_device(std::move(device)) - , m_params(std::move(params)) - {} - ~CCUDAExportableMemory() override; + private: - core::smart_refctd_ptr m_device; - SCachedCreationParams m_params; + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; + CUmemGenericAllocationHandle m_allocationHandle; }; } diff --git 
a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h index 8fbbccb31b..4e3bfcd085 100644 --- a/include/nbl/video/CCUDAImportedMemory.h +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -16,24 +16,24 @@ namespace nbl::video class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted { - public: - friend class CCUDADevice; + public: - CUexternalMemory getInternalObject() const { return m_handle; } - CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, + CUexternalMemory cuExtMem) : + m_device(device), + m_src(src), + m_handle(cuExtMem) {} - protected: - CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, - CUexternalMemory cuExtMem) : - m_device(device), - m_src(src), - m_handle(cuExtMem) {} + ~CCUDAImportedMemory() override; - ~CCUDAImportedMemory() override; + CUexternalMemory getInternalObject() const { return m_handle; } + CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalMemory m_handle; + private: + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalMemory m_handle; }; diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h index 4d014b9e39..2e5010fa2d 100644 --- a/include/nbl/video/CCUDAImportedSemaphore.h +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -9,7 +9,7 @@ #include "cuda.h" #include "nvrtc.h" #if CUDA_VERSION < 9000 - #error "Need CUDA 9.0 SDK or higher." + #error "Need CUDA 9.0 SDK or higher." 
#endif // useful includes in the future @@ -21,25 +21,22 @@ namespace nbl::video class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted { -public: - friend class CCUDADevice; - - CUexternalSemaphore getInternalObject() const { return m_handle; } - -protected: - - CCUDAImportedSemaphore(core::smart_refctd_ptr device, - core::smart_refctd_ptr src, - CUexternalSemaphore semaphore) - : m_device(std::move(device)) - , m_src(std::move(src)) - , m_handle(semaphore) - {} - ~CCUDAImportedSemaphore() override; - - core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_src; - CUexternalSemaphore m_handle; + public: + + CUexternalSemaphore getInternalObject() const { return m_handle; } + CCUDAImportedSemaphore(core::smart_refctd_ptr device, + core::smart_refctd_ptr src, + CUexternalSemaphore semaphore) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_handle(semaphore) + {} + ~CCUDAImportedSemaphore() override; + + private: + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; }; } diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 3f933be988..423491df6d 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -134,12 +134,12 @@ CUresult CCUDADevice::createExportableMemory( return err; } - *outMem = core::smart_refctd_ptr(new CCUDAExportableMemory(core::smart_refctd_ptr(this), std::move(params)), core::dont_grab); + *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); return CUDA_SUCCESS; } -CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, IDeviceMemoryAllocation* mem) +CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem) { if (!mem || !outPtr) return CUDA_ERROR_INVALID_VALUE; @@ -166,11 +166,11 @@ CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr(new CCUDAImportedMemory(core::smart_refctd_ptr(this), 
core::smart_refctd_ptr(mem), cuExtMem), core::dont_grab); + *outPtr = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); return CUDA_SUCCESS; } -CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, ISemaphore* sema) +CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sema) { if (!sema || !outPtr) return CUDA_ERROR_INVALID_VALUE; @@ -197,7 +197,7 @@ CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr(new CCUDAImportedSemaphore(core::smart_refctd_ptr(this), core::smart_refctd_ptr(sema), cusema), core::dont_grab); + *outPtr = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); return CUDA_SUCCESS; } From 3e905e9ce2085954ec551f93707b9ff3c8f978d6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:34:03 +0700 Subject: [PATCH 64/83] Add some comment --- include/nbl/video/EApiType.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index 0726049200..7f99d40309 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,7 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; +// TODO(kevinyu): Should I move this type and functions to its own file? 
using external_handle_t = #ifdef _WIN32 void* From 963a3d66732f90a77866100bd3a93c561d721cf2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:34:48 +0700 Subject: [PATCH 65/83] Fix typo --- include/nbl/video/CCUDADevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index ce9d0ea3b2..57e1e6bd53 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -97,7 +97,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); protected: - CUresult reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 423491df6d..b7aae1e3d9 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -57,7 +57,7 @@ size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; } -CUresult CCUDADevice::reserveAdrressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) { auto& cu = 
m_handler->getCUDAFunctionTable(); @@ -121,7 +121,7 @@ CUresult CCUDADevice::createExportableMemory( return err; } - if (auto err = reserveAdrressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { CloseExternalHandle(params.externalHandle); cu.pcuMemRelease(mem); From 0de37b0646dc8317fd1577480ce9963406667e87 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:35:29 +0700 Subject: [PATCH 66/83] Slight improvement --- include/nbl/video/CCUDADevice.h | 2 +- src/nbl/video/CCUDADevice.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 57e1e6bd53..bf0cb7d899 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -97,7 +97,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); protected: - CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory); + CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index b7aae1e3d9..1e6d020161 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -57,7 +57,7 @@ size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) return ((size - 1) / 
m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; } -CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) +CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const { auto& cu = m_handler->getCUDAFunctionTable(); From d50d709a4c06370cb0281eb735ace856240d72bf Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:36:30 +0700 Subject: [PATCH 67/83] Remove unused variable --- src/nbl/video/CCUDADevice.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 1e6d020161..c523a32a98 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -144,8 +144,6 @@ CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptrgetCreationParams().memoryPropertyFlags; - auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = mem->getCreationParams().externalHandleType; From 763d173a4303eaa7794998638f01097108fb61c2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:56:31 +0700 Subject: [PATCH 68/83] Add include WIN32 include guard --- include/nbl/video/CCUDADevice.h | 4 +++- src/nbl/video/CCUDADevice.cpp | 38 +++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index bf0cb7d899..766f06c82c 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -96,9 +96,11 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); - protected: + private: CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, 
CUmemGenericAllocationHandle memory) const; + CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; + friend class CCUDAHandler; CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); ~CCUDADevice(); diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index c523a32a98..11e764f2f5 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,7 +3,9 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDADevice.h" +#ifdef _WIN32 #include +#endif #include "nbl/video/CCUDAImportedMemory.h" @@ -39,13 +41,7 @@ CCUDADevice::CCUDADevice( for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) { - uint32_t metaData[16] = { 48 }; - CUmemAllocationProp prop = { - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = {.type = static_cast(i), .id = m_handle }, - .win32HandleMetaData = metaData, - }; + const auto prop = getMemAllocationProp(static_cast(i)); auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); assert(CUDA_SUCCESS == re); @@ -88,6 +84,24 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } +CUmemAllocationProp CCUDADevice::getMemAllocationProp(CUmemLocationType locationType) const +{ + +#ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = {}; + metadata.Length = sizeof(OBJECT_ATTRIBUTES); +#endif + + return { + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = locationType, .id = m_handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; +} + CUresult CCUDADevice::createExportableMemory( core::smart_refctd_ptr* outMem, CCUDAExportableMemory::SCreationParams&& 
inParams) @@ -99,15 +113,7 @@ CUresult CCUDADevice::createExportableMemory( auto& cu = m_handler->getCUDAFunctionTable(); - OBJECT_ATTRIBUTES metadata = {}; - metadata.Length = sizeof(OBJECT_ATTRIBUTES); - - CUmemAllocationProp prop = { - .type = CU_MEM_ALLOCATION_TYPE_PINNED, - .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = params.location, .id = m_handle }, - .win32HandleMetaData = &metadata, - }; + const auto prop = getMemAllocationProp(params.location); params.granularSize = roundToGranularity(params.location, params.size); From d71e52d6e94b141913f04eaca25f7127de5e72a2 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 23 Apr 2026 23:58:01 +0700 Subject: [PATCH 69/83] Remove unused class --- include/nbl/video/CCUDAExportableMemory.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index d96a5ad62b..aa197a2e4c 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -20,10 +20,6 @@ namespace nbl::video { -class CCUDAMemoryMapping: public core::IReferenceCounted -{ -}; - class CCUDADevice; class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted From cfad81644ea99f4786303c2fe75054cfedc07328 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:08:45 +0700 Subject: [PATCH 70/83] Refactor CCUDADevice api to be more consistent with vulkan device api --- include/nbl/video/CCUDADevice.h | 10 +++++--- src/nbl/video/CCUDADevice.cpp | 42 +++++++++++---------------------- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 766f06c82c..8654e81571 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -87,14 +87,18 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted } CUdevice getInternalObject() const { return m_handle; } + const CCUDAHandler* getHandler() const { 
return m_handler.get(); } + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + size_t roundToGranularity(CUmemLocationType location, size_t size) const; - CUresult createExportableMemory(core::smart_refctd_ptr* outMem, struct CCUDAExportableMemory::SCreationParams&& inParams); - CUresult importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem); + core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); + + core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); - CUresult importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sem); + core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); private: CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 11e764f2f5..bedc582b9f 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -102,13 +102,8 @@ CUmemAllocationProp CCUDADevice::getMemAllocationProp(CUmemLocationType location }; } -CUresult CCUDADevice::createExportableMemory( - core::smart_refctd_ptr* outMem, - CCUDAExportableMemory::SCreationParams&& inParams) +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) { - if (!outMem) - return CUDA_ERROR_INVALID_VALUE; - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; auto& cu = m_handler->getCUDAFunctionTable(); @@ -119,41 +114,37 @@ CUresult CCUDADevice::createExportableMemory( CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) - return err; + return nullptr; if (auto err = 
cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { cu.pcuMemRelease(mem); - return err; + return nullptr; } if (auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { CloseExternalHandle(params.externalHandle); cu.pcuMemRelease(mem); - return err; + return nullptr; } if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { CloseExternalHandle(params.externalHandle); - return err; + return nullptr; } - *outMem = core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); - - return CUDA_SUCCESS; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); } -CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& mem) +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - if (!mem || !outPtr) - return CUDA_ERROR_INVALID_VALUE; auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = mem->getCreationParams().externalHandleType; - if (!handleType) return CUDA_ERROR_INVALID_VALUE; + if (!handleType) return nullptr; const auto externalHandle = mem->getExternalHandle(); @@ -169,21 +160,17 @@ CUresult CCUDADevice::importExternalMemory(core::smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); - return CUDA_SUCCESS; + return nullptr; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); } -CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr* outPtr, core::smart_refctd_ptr&& sema) +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - if (!sema || !outPtr) - return CUDA_ERROR_INVALID_VALUE; - auto& cu = m_handler->getCUDAFunctionTable(); auto handleType = sema->getCreationParams().externalHandleTypes.value; if (!handleType) - return CUDA_ERROR_INVALID_VALUE; + return nullptr; 
CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { #ifdef _WIN32 @@ -199,10 +186,9 @@ CUresult CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); - return CUDA_SUCCESS; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); } CCUDADevice::~CCUDADevice() From b22168e6f6238b415a1d8902d1f68bd7967149fb Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:12:25 +0700 Subject: [PATCH 71/83] Refactor constructor parameter naming --- include/nbl/video/CCUDADevice.h | 8 ++++---- src/nbl/video/CCUDADevice.cpp | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 8654e81571..b3be2a9014 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -81,6 +81,10 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); + + ~CCUDADevice(); + inline core::SRange geDefaultCompileOptions() const { return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; @@ -104,10 +108,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; - - friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, CUdevice _device, core::smart_refctd_ptr&& _handler); - ~CCUDADevice(); 
std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index bedc582b9f..25c93222da 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -14,17 +14,17 @@ namespace nbl::video { CCUDADevice::CCUDADevice( - core::smart_refctd_ptr&& _vulkanConnection, - IPhysicalDevice* const _vulkanDevice, - const E_VIRTUAL_ARCHITECTURE _virtualArchitecture, - CUdevice _device, - core::smart_refctd_ptr&& _handler) : + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + CUdevice device, + core::smart_refctd_ptr&& handler) : m_defaultCompileOptions(), - m_vulkanConnection(std::move(_vulkanConnection)), - m_vulkanDevice(_vulkanDevice), - m_virtualArchitecture(_virtualArchitecture), - m_handle(_device), - m_handler(std::move(_handler)), + m_vulkanConnection(std::move(vulkanConnection)), + m_vulkanDevice(vulkanDevice), + m_virtualArchitecture(virtualArchitecture), + m_handle(device), + m_handler(std::move(handler)), m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); From 5bd64ae2e9540d539d15b2aeeb3dee7afc7446ee Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 00:14:07 +0700 Subject: [PATCH 72/83] Idiomatic way to create core::smart_refctd_ptr --- src/nbl/video/CCUDAHandler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 770db41946..cee01b976b 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -664,7 +664,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); + return core::make_smart_refctd_ptr(new 
CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); } } return nullptr; From bd0f8a270f062c20aa0099d04a0a1fa8e870ce24 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 01:10:39 +0700 Subject: [PATCH 73/83] Fix destruction and remove unnecessary SCUDACleaner --- include/nbl/video/CCUDADevice.h | 7 ------- include/nbl/video/IDeviceMemoryAllocation.h | 5 ----- include/nbl/video/IDeviceMemoryAllocator.h | 6 +----- src/nbl/video/CCUDAExportableMemory.cpp | 10 +++++----- src/nbl/video/CCUDAHandler.cpp | 2 +- 5 files changed, 7 insertions(+), 23 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index b3be2a9014..ffa006b4d9 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -38,13 +38,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; #endif - struct SCUDACleaner : video::ICleanup - { - core::smart_refctd_ptr resource; - SCUDACleaner(core::smart_refctd_ptr resource) - : resource(std::move(resource)) - {} - }; enum E_VIRTUAL_ARCHITECTURE { EVA_30, diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index e75acf2fd0..6120574baa 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -191,10 +191,6 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted virtual external_handle_t getExternalHandle() const = 0; protected: - inline void setPostDestroyCleanup(std::unique_ptr&& cleanup) - { - m_postDestroyCleanup = std::move(cleanup); - } IDeviceMemoryAllocation( const ILogicalDevice* originDevice, SCreationParams&& params = {}) @@ -213,7 +209,6 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted uint8_t* m_mappedPtr = nullptr; MemoryRange 
m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - std::unique_ptr m_postDestroyCleanup = nullptr; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 8fc07dd698..797536113c 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -125,18 +125,14 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked* dedication = nullptr, const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, - external_handle_t externalHandle = {}, - std::unique_ptr&& postDestroyCleanup = nullptr) + external_handle_t externalHandle = {}) { for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); auto allocation = allocate(allocateInfo); if (allocation.isValid()) - { - allocation.memory->setPostDestroyCleanup(std::move(postDestroyCleanup)); return allocation; - } } return {}; } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index bbe773f610..e778a46875 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -35,17 +35,17 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM dedication, IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, - m_params.externalHandle, - std::make_unique(core::smart_refctd_ptr(this))).memory; + m_params.externalHandle).memory; } CCUDAExportableMemory::~CCUDAExportableMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - CUresult re[] = { - cu.pcuMemUnmap(m_params.ptr, 
m_params.granularSize), - }; + cu.pcuMemUnmap(m_params.ptr, m_params.granularSize); + cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize); + cu.pcuMemRelease(m_allocationHandle); + CloseExternalHandle(m_params.externalHandle); } diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index cee01b976b..19528d4816 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -664,7 +664,7 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - return core::make_smart_refctd_ptr(new CCUDADevice(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)),core::dont_grab); + return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); } } return nullptr; From 6d47b9000d4ea647973c65ce75a00df45f33009e Mon Sep 17 00:00:00 2001 From: kevyuu Date: Fri, 24 Apr 2026 23:58:05 +0700 Subject: [PATCH 74/83] CCUDAHandler construction more idiomatic --- src/nbl/video/CCUDAHandler.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 19528d4816..060afe6631 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -526,9 +526,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste )); } - - CCUDAHandler* handler = new CCUDAHandler(std::move(cuda), std::move(nvrtc),std::move(headers), std::move(_logger), cudaVersion); - return core::smart_refctd_ptr(handler,core::dont_grab); + return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); } nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) From 0257d9afbff4779532a1fa0042d565ffc29ad0fc Mon Sep 17 
00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 00:31:06 +0700 Subject: [PATCH 75/83] Refactor magic number --- include/nbl/video/CCUDADevice.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index ffa006b4d9..f19a7fdae6 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -87,7 +87,7 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted const CCUDAHandler* getHandler() const { return m_handler.get(); } - bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_vulkanDevice->getProperties().deviceUUID, 16); } + bool isMatchingDevice(const IPhysicalDevice* device) { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } size_t roundToGranularity(CUmemLocationType location, size_t size) const; @@ -102,15 +102,18 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; + static constexpr auto CudaMemoryLocationCount = 5; + + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_vulkanDevice; + IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; core::smart_refctd_ptr m_handler; CUdevice m_handle; CUcontext m_context; - size_t m_allocationGranularity[4]; + std::array m_allocationGranularity; }; } From 099999478b1574930dfafac2e22e936931452781 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 00:32:03 +0700 Subject: [PATCH 76/83] Remove releasing allocationHandle in destructor, since we already call it after cuMemMap --- include/nbl/video/CCUDAExportableMemory.h | 4 +--- src/nbl/video/CCUDAExportableMemory.cpp | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git 
a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h index aa197a2e4c..1c3d206906 100644 --- a/include/nbl/video/CCUDAExportableMemory.h +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -40,10 +40,9 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted external_handle_t externalHandle; }; - CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params, CUmemGenericAllocationHandle allocationHandle) + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) : m_device(std::move(device)) , m_params(std::move(params)) - , m_allocationHandle(allocationHandle) {} ~CCUDAExportableMemory() override; @@ -57,7 +56,6 @@ class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted core::smart_refctd_ptr m_device; SCachedCreationParams m_params; - CUmemGenericAllocationHandle m_allocationHandle; }; } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index e778a46875..e6c6b67509 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -44,7 +44,6 @@ CCUDAExportableMemory::~CCUDAExportableMemory() cu.pcuMemUnmap(m_params.ptr, m_params.granularSize); cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize); - cu.pcuMemRelease(m_allocationHandle); CloseExternalHandle(m_params.externalHandle); From 6f4b889cbb9cbb1586b99ecb4e62adfdac6be965 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Sat, 25 Apr 2026 00:32:38 +0700 Subject: [PATCH 77/83] Input validation and error logging --- include/nbl/video/CCUDAHandler.h | 15 ++++- src/nbl/video/CCUDADevice.cpp | 72 ++++++++++++++---------- src/nbl/video/CCUDAExportableMemory.cpp | 10 ++-- src/nbl/video/CCUDAImportedMemory.cpp | 3 +- src/nbl/video/CCUDAImportedSemaphore.cpp | 3 +- 5 files changed, 63 insertions(+), 40 deletions(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 9de55914b5..602637f202 
100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,11 +16,13 @@ namespace nbl::video { + class NBL_API2 CCUDAHandler : public core::IReferenceCounted { public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); - inline bool defaultHandleResult(CUresult result) + static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); + + inline bool defaultHandleResult(CUresult result) const { core::smart_refctd_ptr logger = m_logger.get(); return defaultHandleResult(result,logger.get()); @@ -137,6 +139,7 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ,cuImportExternalSemaphore ,cuSignalExternalSemaphoresAsync ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -291,6 +294,14 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted int m_version; }; +#define ASSERT_CUDA_SUCCESS(expr, handler) \ + do { \ + const auto cudaResult = (expr); \ + if (!((handler)->defaultHandleResult(cudaResult))) { \ + assert(false); \ + } \ + } while(0) + } #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 25c93222da..83cbb2573a 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -19,9 +19,10 @@ CCUDADevice::CCUDADevice( const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler) : + m_logger(vulkanDevice->getDebugCallback()->getLogger()), m_defaultCompileOptions(), m_vulkanConnection(std::move(vulkanConnection)), - m_vulkanDevice(vulkanDevice), + m_physicalDevice(vulkanDevice), m_virtualArchitecture(virtualArchitecture), m_handle(device), m_handler(std::move(handler)), @@ -32,19 +33,15 @@ CCUDADevice::CCUDADevice( m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); - auto& cu = m_handler->getCUDAFunctionTable(); + 
const auto& cu = m_handler->getCUDAFunctionTable(); - CUresult re = cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle); - assert(CUDA_SUCCESS == re); - re = cu.pcuCtxSetCurrent(m_context); - assert(CUDA_SUCCESS == re); + ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); - for (uint32_t i = 0; i < ARRAYSIZE(m_allocationGranularity); ++i) + for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - const auto prop = getMemAllocationProp(static_cast(i)); - auto re = cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[i], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); - - assert(CUDA_SUCCESS == re); + const auto prop = getMemAllocationProp(static_cast(locationType)); + ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); } } @@ -55,15 +52,15 @@ size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const { - auto& cu = m_handler->getCUDAFunctionTable(); + const auto& cu = m_handler->getCUDAFunctionTable(); CUdeviceptr ptr = 0; - if (auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) return err; - if (auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) + if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - cu.pcuMemAddressFree(ptr, size); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); return err; } @@ -74,8 +71,8 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz if (auto err = cu.pcuMemSetAccess(ptr, size, 
&accessDesc, 1); CUDA_SUCCESS != err) { - cu.pcuMemUnmap(ptr, size); - cu.pcuMemAddressFree(ptr, size); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); return err; } @@ -114,35 +111,44 @@ core::smart_refctd_ptr CCUDADevice::createExportableMemor CUmemGenericAllocationHandle mem; if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) + { + m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); return nullptr; + } if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) { - cu.pcuMemRelease(mem); + m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); return nullptr; } - if (auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) + if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - CloseExternalHandle(params.externalHandle); - cu.pcuMemRelease(mem); + m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + + bool closeSucceed = CloseExternalHandle(params.externalHandle); + assert(closeSucceed); + return nullptr; } - if (auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) { - CloseExternalHandle(params.externalHandle); + bool closeSucceed = CloseExternalHandle(params.externalHandle); + assert(closeSucceed); return nullptr; } - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params), mem); + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); } core::smart_refctd_ptr 
CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - - auto& cu = m_handler->getCUDAFunctionTable(); - auto handleType = mem->getCreationParams().externalHandleType; + const auto& cu = m_handler->getCUDAFunctionTable(); + const auto handleType = mem->getCreationParams().externalHandleType; if (!handleType) return nullptr; @@ -159,8 +165,11 @@ core::smart_refctd_ptr CCUDADevice::importExternalMemory(co extMemDesc.size = mem->getAllocationSize(); CUexternalMemory cuExtMem; - if (auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + if (const auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); return nullptr; + } return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); } @@ -185,15 +194,18 @@ core::smart_refctd_ptr CCUDADevice::importExternalSemaph CUexternalSemaphore cusema; - if (auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + if (const auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import semaphore into CUDA!"); return nullptr; + } return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); } CCUDADevice::~CCUDADevice() { - m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context); + ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); } } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index e6c6b67509..ae64cf777f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -40,12 +40,14 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM CCUDAExportableMemory::~CCUDAExportableMemory() { - auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + const auto& cu = 
m_device->getHandler()->getCUDAFunctionTable(); - cu.pcuMemUnmap(m_params.ptr, m_params.granularSize); - cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize); + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler()); - CloseExternalHandle(m_params.externalHandle); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + + bool closeSucceed = CloseExternalHandle(m_params.externalHandle); + assert(closeSucceed); } } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp index 33ba43eb28..7e21b05ef1 100644 --- a/src/nbl/video/CCUDAImportedMemory.cpp +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -24,8 +24,7 @@ CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) CCUDAImportedMemory::~CCUDAImportedMemory() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (cu.pcuDestroyExternalMemory(m_handle) != CUDA_SUCCESS) - assert(!"Invalid code path"); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); } } diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp index 69b851088e..0dc750a4a9 100644 --- a/src/nbl/video/CCUDAImportedSemaphore.cpp +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -11,8 +11,7 @@ namespace nbl::video CCUDAImportedSemaphore::~CCUDAImportedSemaphore() { auto& cu = m_device->getHandler()->getCUDAFunctionTable(); - if (cu.pcuDestroyExternalSemaphore(m_handle) != CUDA_SUCCESS) - assert(!"Invalid code path."); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); } } From 129ceaca26a7c5eeebdb64d6fb355fbbb2113dc6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 13:06:33 +0700 Subject: [PATCH 78/83] Revert 6605bebf changes in tgmath impl.hlsl --- include/nbl/builtin/hlsl/tgmath/impl.hlsl | 63 ++++++++++++----------- 1 file changed, 32 insertions(+), 31 deletions(-) diff 
--git a/include/nbl/builtin/hlsl/tgmath/impl.hlsl b/include/nbl/builtin/hlsl/tgmath/impl.hlsl index 4d1a30c757..0c1dc2f458 100644 --- a/include/nbl/builtin/hlsl/tgmath/impl.hlsl +++ b/include/nbl/builtin/hlsl/tgmath/impl.hlsl @@ -197,12 +197,12 @@ struct erf_helper(NBL_FP64_LITERAL(0.254829592)); + const FloatingPoint a2 = _static_cast(NBL_FP64_LITERAL(-0.284496736)); + const FloatingPoint a3 = _static_cast(NBL_FP64_LITERAL(1.421413741)); + const FloatingPoint a4 = _static_cast(NBL_FP64_LITERAL(-1.453152027)); + const FloatingPoint a5 = _static_cast(NBL_FP64_LITERAL(1.061405429)); + const FloatingPoint p = _static_cast(NBL_FP64_LITERAL(0.3275911)); FloatingPoint _sign = FloatingPoint(sign(_x)); FloatingPoint x = abs(_x); @@ -393,10 +393,10 @@ struct erf_helper static float16_t __call(float16_t _x) { // A&S approximation to 2.5x10-5 - const float16_t a1 = float16_t(0.3480242f); - const float16_t a2 = float16_t(-0.0958798f); - const float16_t a3 = float16_t(0.7478556f); - const float16_t p = float16_t(0.47047f); + const float16_t a1 = _static_cast(0.3480242f); + const float16_t a2 = _static_cast(-0.0958798f); + const float16_t a3 = _static_cast(0.7478556f); + const float16_t p = _static_cast(0.47047f); float16_t _sign = float16_t(sign(_x)); float16_t x = abs_helper::__call(_x); @@ -414,35 +414,36 @@ struct erfInv_helper(_x, FloatingPoint(NBL_FP64_LITERAL(-0.99999)), FloatingPoint(NBL_FP64_LITERAL(0.99999))); + // TODO: maybe need to replace `FloatingPoint(NBL_FP64_LITERAL` with `_static_cast(NBL_FP64_LITERAL` to make DXC shut up + FloatingPoint x = clamp(_x, _static_cast(NBL_FP64_LITERAL(-0.99999)), _static_cast(NBL_FP64_LITERAL(0.99999))); - FloatingPoint w = -log_helper::__call((FloatingPoint(NBL_FP64_LITERAL(1.0)) - x) * (FloatingPoint(NBL_FP64_LITERAL(1.0)) + x)); + FloatingPoint w = -log_helper::__call((_static_cast(NBL_FP64_LITERAL(1.0)) - x) * (_static_cast(NBL_FP64_LITERAL(1.0)) + x)); FloatingPoint p; if (w < 5.0) { - w -= 
FloatingPoint(NBL_FP64_LITERAL(2.5)); - p = FloatingPoint(NBL_FP64_LITERAL(2.81022636e-08)); - p = FloatingPoint(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00021858087)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00125372503)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00417768164)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.246640727)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.50140941)) + p * w; + w -= _static_cast(NBL_FP64_LITERAL(2.5)); + p = _static_cast(NBL_FP64_LITERAL(2.81022636e-08)); + p = _static_cast(NBL_FP64_LITERAL(3.43273939e-07)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-3.5233877e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-4.39150654e-06)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00021858087)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00125372503)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.00417768164)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.246640727)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.50140941)) + p * w; } else { w = sqrt_helper::__call(w) - FloatingPoint(NBL_FP64_LITERAL(3.0)); - p = FloatingPoint(NBL_FP64_LITERAL(-0.000200214257)); - p = FloatingPoint(NBL_FP64_LITERAL(0.000100950558)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00134934322)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.00367342844)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00573950773)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(-0.0076224613)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(0.00943887047)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(1.00167406)) + p * w; - p = FloatingPoint(NBL_FP64_LITERAL(2.83297682)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.000200214257)); + p = _static_cast(NBL_FP64_LITERAL(0.000100950558)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00134934322)) + p * w; + p = 
_static_cast(NBL_FP64_LITERAL(-0.00367342844)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00573950773)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(-0.0076224613)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(0.00943887047)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(1.00167406)) + p * w; + p = _static_cast(NBL_FP64_LITERAL(2.83297682)) + p * w; } return p * x; } From 2c084646347d3d9ee446572ebf039acbe89b33c6 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 13:20:14 +0700 Subject: [PATCH 79/83] Fix indentation in IDeviceMemoryAllocator.h --- include/nbl/video/IDeviceMemoryAllocator.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index 797536113c..019fbd9358 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -47,11 +47,11 @@ class NBL_API2 IDeviceMemoryAllocator core::bitflag allocateFlags, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, external_handle_t handle) : - m_allocateFlags(static_cast(allocateFlags.value)), - m_reqs(reqs), - m_handleType(handleType), + m_allocateFlags(static_cast(allocateFlags.value)), + m_reqs(reqs), + m_handleType(handleType), m_handle(handle) - {} + {} static inline uint32_t end() {return 32u;} @@ -96,7 +96,7 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, external_handle_t handle ) : - IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) + IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } From e9937576030919e49803540524e59be7bbe7d078 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 14:17:32 +0700 Subject: [PATCH 80/83] Turn off NBL_COMPILE_WITH_CUDA by default --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
5be1855959..fa74e167f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,8 +70,7 @@ else() message(STATUS "Vulkan SDK is not found") endif() -# TODO(kevinyu): Turn off this flag after I finish developing the PR. -option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" ON) +option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) From dcf05522a1b1a04a594a9cf16bad59132a622456 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 15:04:41 +0700 Subject: [PATCH 81/83] Move CCUDAHandler constructor from protected to public --- include/nbl/video/CCUDAHandler.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 602637f202..61e9522a66 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -158,6 +158,8 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted ); const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + // inline core::SRange getSTDHeaders() { @@ -261,7 +263,6 @@ class NBL_API2 CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); ~CCUDAHandler() = default; From f6bf98938bb975fe056ff3b4284900e5dc77b4b8 Mon Sep 17 00:00:00 2001 From: kevyuu Date: Thu, 30 Apr 2026 23:22:38 +0700 Subject: [PATCH 82/83] Fix crash due to dangling win32metadata --- include/nbl/video/CCUDADevice.h | 2 -- src/nbl/video/CCUDADevice.cpp | 39 ++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 
f19a7fdae6..02f85fdac8 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -100,8 +100,6 @@ class NBL_API2 CCUDADevice : public core::IReferenceCounted private: CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; - CUmemAllocationProp getMemAllocationProp(CUmemLocationType locationType) const; - static constexpr auto CudaMemoryLocationCount = 5; const system::logger_opt_ptr m_logger; diff --git a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 83cbb2573a..27f8f6f906 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -40,7 +40,21 @@ CCUDADevice::CCUDADevice( for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) { - const auto prop = getMemAllocationProp(static_cast(locationType)); + + #ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; + #endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = static_cast(locationType), .id = m_handle }, + #ifdef _WIN32 + .win32HandleMetaData = &metadata, + #endif + }; ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); } } @@ -81,32 +95,27 @@ CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t siz return CUDA_SUCCESS; } -CUmemAllocationProp CCUDADevice::getMemAllocationProp(CUmemLocationType locationType) const +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) { + CCUDAExportableMemory::SCachedCreationParams params = { inParams }; + + auto& cu = m_handler->getCUDAFunctionTable(); #ifdef _WIN32 - OBJECT_ATTRIBUTES metadata = {}; - metadata.Length = sizeof(OBJECT_ATTRIBUTES); + 
OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; #endif - return { + const auto prop = CUmemAllocationProp{ .type = CU_MEM_ALLOCATION_TYPE_PINNED, .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, - .location = { .type = locationType, .id = m_handle }, + .location = { .type = params.location, .id = m_handle }, #ifdef _WIN32 .win32HandleMetaData = &metadata, #endif }; -} -core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) -{ - CCUDAExportableMemory::SCachedCreationParams params = { inParams }; - - auto& cu = m_handler->getCUDAFunctionTable(); - - const auto prop = getMemAllocationProp(params.location); - params.granularSize = roundToGranularity(params.location, params.size); CUmemGenericAllocationHandle mem; From 0d237c08b5e183074d6307e57e21f8e86546fcdd Mon Sep 17 00:00:00 2001 From: kevyuu Date: Mon, 4 May 2026 14:25:19 +0700 Subject: [PATCH 83/83] Implement vk flag for HOST_NUMA and HOST_NUMA_CURRENT --- src/nbl/video/CCUDAExportableMemory.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp index ae64cf777f..66cbbdcf4f 100644 --- a/src/nbl/video/CCUDAExportableMemory.cpp +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -17,11 +17,10 @@ core::smart_refctd_ptr CCUDAExportableMemory::exportAsM switch (m_params.location) { - case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; - // TODO(Atil): Figure out how to handle these case CU_MEM_LOCATION_TYPE_HOST_NUMA: case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; default: break; }