diff --git a/3rdparty/jitify b/3rdparty/jitify index 0d6dbd8ccd..1a0ca0e837 160000 --- a/3rdparty/jitify +++ b/3rdparty/jitify @@ -1 +1 @@ -Subproject commit 0d6dbd8ccd07e6bfc811d363a54912dfc6d4799a +Subproject commit 1a0ca0e837405506f3b8f7883bacb71c20d86d96 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ba3410075..fa74e167f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,10 +74,10 @@ option(NBL_COMPILE_WITH_CUDA "Compile with CUDA interop?" OFF) if(NBL_COMPILE_WITH_CUDA) find_package(CUDAToolkit REQUIRED) - if(${CUDAToolkit_VERSION} VERSION_GREATER "9.0") - message(STATUS "CUDA version 9.0+ found!") + if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") + message(STATUS "CUDA version ${CUDAToolkit_VERSION} found!") else() - message(FATAL_ERROR "CUDA version 9.0+ needed for C++14 support!") + message(FATAL_ERROR "CUDA Toolkit 13.0+ is required, but found version ${CUDAToolkit_VERSION}!") endif() endif() diff --git a/examples_tests b/examples_tests index eebde787c2..93ca5efe58 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit eebde787c233367ade8eb0580bc79c0d562e97aa +Subproject commit 93ca5efe588ca85c1eaf81a486b611df98403580 diff --git a/include/nbl/asset/IBuffer.h b/include/nbl/asset/IBuffer.h index 3a7cbb5983..99f85e0b72 100644 --- a/include/nbl/asset/IBuffer.h +++ b/include/nbl/asset/IBuffer.h @@ -42,6 +42,8 @@ class IBuffer : public IDescriptor, public core::IBuffer //! synthetic Nabla inventions // whether `IGPUCommandBuffer::updateBuffer` can be used on this buffer EUF_INLINE_UPDATE_VIA_CMDBUF = 0x80000000u, + + EUF_SYNTHETIC_FLAGS_MASK = EUF_INLINE_UPDATE_VIA_CMDBUF | 0 /* fill out as needed if any more synthetic flags are added */ }; //! 
diff --git a/include/nbl/video/CCUDADevice.h b/include/nbl/video/CCUDADevice.h index 1120224fdb..02f85fdac8 100644 --- a/include/nbl/video/CCUDADevice.h +++ b/include/nbl/video/CCUDADevice.h @@ -6,6 +6,9 @@ #include "nbl/video/IPhysicalDevice.h" +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDAImportedSemaphore.h" #ifdef _NBL_COMPILE_WITH_CUDA_ @@ -24,9 +27,17 @@ namespace nbl::video { class CCUDAHandler; -class CCUDADevice : public core::IReferenceCounted +class NBL_API2 CCUDADevice : public core::IReferenceCounted { - public: + public: +#ifdef _WIN32 + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_WIN32; + static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_WIN32; +#else + static constexpr IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE EXTERNAL_MEMORY_HANDLE_TYPE = IDeviceMemoryAllocation::EHT_OPAQUE_FD; + static constexpr CUmemAllocationHandleType ALLOCATION_HANDLE_TYPE = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif + enum E_VIRTUAL_ARCHITECTURE { EVA_30, @@ -63,132 +74,44 @@ class CCUDADevice : public core::IReferenceCounted }; inline E_VIRTUAL_ARCHITECTURE getVirtualArchitecture() {return m_virtualArchitecture;} + CCUDADevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* const vulkanDevice, const E_VIRTUAL_ARCHITECTURE virtualArchitecture, CUdevice device, core::smart_refctd_ptr&& handler); + + ~CCUDADevice(); + inline core::SRange geDefaultCompileOptions() const { return {m_defaultCompileOptions.data(),m_defaultCompileOptions.data()+m_defaultCompileOptions.size()}; } - // TODO/REDO Vulkan: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXTRES__INTEROP.html - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#vulkan-interoperability - // Watch out, use Driver API (`cu` functions) NOT the Runtime API (`cuda` functions) - // Also 
maybe separate this out into its own `CCUDA` class instead of nesting it here? -#if 0 - template - struct GraphicsAPIObjLink - { - GraphicsAPIObjLink() : obj(nullptr), cudaHandle(nullptr), acquired(false) - { - asImage = {nullptr}; - } - GraphicsAPIObjLink(core::smart_refctd_ptr&& _obj) : GraphicsAPIObjLink() - { - obj = std::move(_obj); - } - GraphicsAPIObjLink(GraphicsAPIObjLink&& other) : GraphicsAPIObjLink() - { - operator=(std::move(other)); - } - - GraphicsAPIObjLink(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(const GraphicsAPIObjLink& other) = delete; - GraphicsAPIObjLink& operator=(GraphicsAPIObjLink&& other) - { - std::swap(obj,other.obj); - std::swap(cudaHandle,other.cudaHandle); - std::swap(acquired,other.acquired); - std::swap(asImage,other.asImage); - return *this; - } - - ~GraphicsAPIObjLink() - { - assert(!acquired); // you've fucked up, there's no way for us to fix it, you need to release the objects on a proper stream - if (obj) - CCUDAHandler::cuda.pcuGraphicsUnregisterResource(cudaHandle); - } - - // - auto* getObject() const {return obj.get();} - - private: - core::smart_refctd_ptr obj; - CUgraphicsResource cudaHandle; - bool acquired; - - friend class CCUDAHandler; - public: - union - { - struct - { - CUdeviceptr pointer; - } asBuffer; - struct - { - CUmipmappedArray mipmappedArray; - CUarray array; - } asImage; - }; - }; + CUdevice getInternalObject() const { return m_handle; } - // - static CUresult registerBuffer(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - static CUresult registerImage(GraphicsAPIObjLink* link, uint32_t flags = CU_GRAPHICS_REGISTER_FLAGS_NONE); - + const CCUDAHandler* getHandler() const { return m_handler.get(); } - template - static CUresult acquireResourcesFromGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = 
reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsMapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = true; - return retval; - } - template - static CUresult releaseResourcesToGraphics(void* tmpStorage, GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) - { - auto count = std::distance(linksBegin,linksEnd); - - auto resources = reinterpret_cast(tmpStorage); - auto rit = resources; - for (auto iit=linksBegin; iit!=linksEnd; iit++,rit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - *rit = iit->cudaHandle; - } - - auto retval = cuda.pcuGraphicsUnmapResources(count,resources,stream); - for (auto iit=linksBegin; iit!=linksEnd; iit++) - iit->acquired = false; - return retval; - } + bool isMatchingDevice(const IPhysicalDevice* device) const { return device && !memcmp(device->getProperties().deviceUUID, m_physicalDevice->getProperties().deviceUUID, 16); } - static CUresult acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes = nullptr); - static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream); - static CUresult acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); -#endif + size_t roundToGranularity(CUmemLocationType location, size_t size) const; + + core::smart_refctd_ptr createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams); + + core::smart_refctd_ptr importExternalMemory(core::smart_refctd_ptr&& mem); - protected: - friend class CCUDAHandler; - CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE 
_virtualArchitecture); - ~CCUDADevice() = default; - + core::smart_refctd_ptr importExternalSemaphore(core::smart_refctd_ptr&& sem); + + private: + CUresult reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const; + + static constexpr auto CudaMemoryLocationCount = 5; + + const system::logger_opt_ptr m_logger; std::vector m_defaultCompileOptions; core::smart_refctd_ptr m_vulkanConnection; - IPhysicalDevice* const m_vulkanDevice; + IPhysicalDevice* const m_physicalDevice; E_VIRTUAL_ARCHITECTURE m_virtualArchitecture; + + core::smart_refctd_ptr m_handler; + CUdevice m_handle; + CUcontext m_context; + std::array m_allocationGranularity; }; } diff --git a/include/nbl/video/CCUDAExportableMemory.h b/include/nbl/video/CCUDAExportableMemory.h new file mode 100644 index 0000000000..1c3d206906 --- /dev/null +++ b/include/nbl/video/CCUDAExportableMemory.h @@ -0,0 +1,65 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ +#define _NBL_VIDEO_C_CUDA_EXPORTABLE_MEMORY_H_ + + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class CCUDADevice; + +class NBL_API2 CCUDAExportableMemory : public core::IReferenceCounted +{ + public: + + struct SCreationParams + { + size_t size; + uint32_t alignment; + CUmemLocationType location; + }; + + struct SCachedCreationParams : SCreationParams + { + size_t granularSize; + CUdeviceptr ptr; + external_handle_t externalHandle; + }; + + CCUDAExportableMemory(core::smart_refctd_ptr device, SCachedCreationParams&& params) + : m_device(std::move(device)) + , m_params(std::move(params)) + {} + ~CCUDAExportableMemory() override; + + CUdeviceptr getDeviceptr() const { return m_params.ptr; } + + const SCreationParams& getCreationParams() const { return m_params; } + + core::smart_refctd_ptr exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication = nullptr) const; + + private: + + core::smart_refctd_ptr m_device; + SCachedCreationParams m_params; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDAHandler.h b/include/nbl/video/CCUDAHandler.h index 01774b25d2..61e9522a66 100644 --- a/include/nbl/video/CCUDAHandler.h +++ b/include/nbl/video/CCUDAHandler.h @@ -16,11 +16,13 @@ namespace nbl::video { -class CCUDAHandler : public core::IReferenceCounted + +class NBL_API2 CCUDAHandler : public core::IReferenceCounted { - public: - static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger=nullptr); - inline bool defaultHandleResult(CUresult result) + public: + static bool defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger); + + inline bool defaultHandleResult(CUresult result) const { core::smart_refctd_ptr logger = m_logger.get(); return defaultHandleResult(result,logger.get()); @@ -34,12 +36,12 @@ class CCUDAHandler : public core::IReferenceCounted static T* cast_CUDA_ptr(CUdeviceptr ptr) { return 
reinterpret_cast(ptr); } // - core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); + static core::smart_refctd_ptr create(system::ISystem* system, core::smart_refctd_ptr&& _logger); // using LibLoader = system::DefaultFuncPtrLoader; NBL_SYSTEM_DECLARE_DYNAMIC_FUNCTION_CALLER_CLASS(CUDA,LibLoader - ,cuCtxCreate_v2 + ,cuCtxCreate_v4 ,cuDevicePrimaryCtxRetain ,cuDevicePrimaryCtxRelease ,cuDevicePrimaryCtxSetFlags @@ -62,7 +64,7 @@ class CCUDAHandler : public core::IReferenceCounted ,cuDeviceGet ,cuDeviceGetAttribute ,cuDeviceGetLuid - ,cuDeviceGetUuid + ,cuDeviceGetUuid_v2 ,cuDeviceTotalMem_v2 ,cuDeviceGetName ,cuDriverGetVersion @@ -119,6 +121,25 @@ class CCUDAHandler : public core::IReferenceCounted ,cuSurfObjectDestroy ,cuTexObjectCreate ,cuTexObjectDestroy + ,cuImportExternalMemory + ,cuDestroyExternalMemory + ,cuExternalMemoryGetMappedBuffer + ,cuMemUnmap + ,cuMemAddressFree + ,cuMemGetAllocationGranularity + ,cuMemAddressReserve + ,cuMemCreate + ,cuMemExportToShareableHandle + ,cuMemMap + ,cuMemRelease + ,cuMemSetAccess + ,cuMemImportFromShareableHandle + ,cuLaunchHostFunc + ,cuDestroyExternalSemaphore + ,cuImportExternalSemaphore + ,cuSignalExternalSemaphoresAsync + ,cuWaitExternalSemaphoresAsync + ,cuLogsRegisterCallback ); const CUDA& getCUDAFunctionTable() const {return m_cuda;} @@ -137,6 +158,8 @@ class CCUDAHandler : public core::IReferenceCounted ); const NVRTC& getNVRTCFunctionTable() const {return m_nvrtc;} + CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version); + // inline core::SRange getSTDHeaders() { @@ -157,13 +180,25 @@ class CCUDAHandler : public core::IReferenceCounted const auto filesize = file->getSize(); std::string source(filesize+1u,'0'); - system::future bytesRead; + system::IFile::success_t bytesRead; file->read(bytesRead,source.data(),0u,file->getSize()); - source.resize(bytesRead.get()); + source.resize(bytesRead.getBytesProcessed()); 
return createProgram(prog,std::move(source),file->getFileName().string().c_str(),headerCount,headerContents,includeNames); } + struct SCUDADeviceInfo + { + CUdevice handle = {}; + CUuuid uuid = {}; + int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; + }; + + inline core::vector const& getAvailableDevices() const + { + return m_availableDevices; + } + // inline nvrtcResult compileProgram(nvrtcProgram prog, core::SRange options) { @@ -228,16 +263,7 @@ class CCUDAHandler : public core::IReferenceCounted core::smart_refctd_ptr createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice); protected: - CCUDAHandler(CUDA&& _cuda, NVRTC&& _nvrtc, core::vector>&& _headers, core::smart_refctd_ptr&& _logger, int _version) - : m_cuda(std::move(_cuda)), m_nvrtc(std::move(_nvrtc)), m_headers(std::move(_headers)), m_logger(std::move(_logger)), m_version(_version) - { - for (auto& header : m_headers) - { - m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); - m_headerNamesStorage.push_back(header->getFileName().string()); - m_headerNames.push_back(m_headerNamesStorage.back().c_str()); - } - } + ~CCUDAHandler() = default; // @@ -260,6 +286,7 @@ class CCUDAHandler : public core::IReferenceCounted NVRTC m_nvrtc; // + core::vector m_availableDevices; core::vector> m_headers; core::vector m_headerContents; core::vector m_headerNamesStorage; @@ -268,6 +295,14 @@ class CCUDAHandler : public core::IReferenceCounted int m_version; }; +#define ASSERT_CUDA_SUCCESS(expr, handler) \ + do { \ + const auto cudaResult = (expr); \ + if (!((handler)->defaultHandleResult(cudaResult))) { \ + assert(false); \ + } \ + } while(0) + } #endif // _NBL_COMPILE_WITH_CUDA_ diff --git a/include/nbl/video/CCUDAImportedMemory.h b/include/nbl/video/CCUDAImportedMemory.h new file mode 100644 index 0000000000..4e3bfcd085 --- /dev/null +++ b/include/nbl/video/CCUDAImportedMemory.h @@ -0,0 +1,42 @@ +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H +#define 
_NBL_VIDEO_C_CUDA_IMPORTED_MEMORY_H + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." +#endif + +namespace nbl::video +{ + +class NBL_API2 CCUDAImportedMemory : public core::IReferenceCounted +{ + public: + + CCUDAImportedMemory(core::smart_refctd_ptr device, core::smart_refctd_ptr src, + CUexternalMemory cuExtMem) : + m_device(std::move(device)), + m_src(std::move(src)), + m_handle(cuExtMem) {} + + ~CCUDAImportedMemory() override; + + CUexternalMemory getInternalObject() const { return m_handle; } + CUresult getMappedBuffer(CUdeviceptr* mappedBuffer); + + private: + + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalMemory m_handle; + +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif \ No newline at end of file diff --git a/include/nbl/video/CCUDAImportedSemaphore.h b/include/nbl/video/CCUDAImportedSemaphore.h new file mode 100644 index 0000000000..2e5010fa2d --- /dev/null +++ b/include/nbl/video/CCUDAImportedSemaphore.h @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ +#define _NBL_VIDEO_C_CUDA_IMPORTED_SEMAPHORE_H_ + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +#include "cuda.h" +#include "nvrtc.h" +#if CUDA_VERSION < 9000 + #error "Need CUDA 9.0 SDK or higher." 
+#endif + +// useful includes in the future +//#include "cudaEGL.h" +//#include "cudaVDPAU.h" + +namespace nbl::video +{ + +class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted +{ + public: + + CUexternalSemaphore getInternalObject() const { return m_handle; } + CCUDAImportedSemaphore(core::smart_refctd_ptr device, + core::smart_refctd_ptr src, + CUexternalSemaphore semaphore) + : m_device(std::move(device)) + , m_src(std::move(src)) + , m_handle(semaphore) + {} + ~CCUDAImportedSemaphore() override; + + private: + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_src; + CUexternalSemaphore m_handle; +}; + +} + +#endif // _NBL_COMPILE_WITH_CUDA_ + +#endif diff --git a/include/nbl/video/CVulkanDeviceMemoryBacked.h b/include/nbl/video/CVulkanDeviceMemoryBacked.h index e6d17ddf3e..696d69058f 100644 --- a/include/nbl/video/CVulkanDeviceMemoryBacked.h +++ b/include/nbl/video/CVulkanDeviceMemoryBacked.h @@ -35,11 +35,11 @@ class CVulkanDeviceMemoryBacked : public Interface protected: // special constructor for when memory requirements are known up-front (so far only swapchains and internal forwarding here) CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const IDeviceMemoryBacked::SDeviceMemoryRequirements& _memReqs, const VkResource_t vkHandle); - CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, const VkResource_t vkHandle) : - CVulkanDeviceMemoryBacked(dev,std::move(_creationParams),obtainRequirements(dev,vkHandle),vkHandle) {} + CVulkanDeviceMemoryBacked(const CVulkanLogicalDevice* dev, Interface::SCreationParams&& _creationParams, bool dedicatedOnly, const VkResource_t vkHandle) : + CVulkanDeviceMemoryBacked(dev,std::move(_creationParams), obtainRequirements(dev, dedicatedOnly, vkHandle),vkHandle) {} private: - static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, const 
VkResource_t vkHandle); + static IDeviceMemoryBacked::SDeviceMemoryRequirements obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle); core::smart_refctd_ptr m_memory = nullptr; size_t m_offset = 0u; diff --git a/include/nbl/video/EApiType.h b/include/nbl/video/EApiType.h index e670dc90d8..7f99d40309 100644 --- a/include/nbl/video/EApiType.h +++ b/include/nbl/video/EApiType.h @@ -13,6 +13,45 @@ enum E_API_TYPE : uint32_t //EAT_WEBGPU }; +// TODO(kevinyu): Should I move this type and functions to its own file? +using external_handle_t = +#ifdef _WIN32 +void* +#else +int +#endif +; + +#ifdef _WIN32 +constexpr external_handle_t ExternalHandleNull = nullptr; +#else +constexpr external_handle_t ExternalHandleNull = -1; +#endif + +inline bool CloseExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + return CloseHandle(handle); +#else + return (close(handle) == 0); +#endif +} + +inline external_handle_t DuplicateExternalHandle(external_handle_t handle) +{ +#ifdef _WIN32 + HANDLE re = ExternalHandleNull; + + const HANDLE cur = GetCurrentProcess(); + if (!DuplicateHandle(cur, handle, cur, &re, GENERIC_ALL, 0, DUPLICATE_SAME_ACCESS)) + return ExternalHandleNull; + + return re; +#else + return dup(handle); +#endif +} + } #endif diff --git a/include/nbl/video/IDeviceMemoryAllocation.h b/include/nbl/video/IDeviceMemoryAllocation.h index 00e55a66e3..6120574baa 100644 --- a/include/nbl/video/IDeviceMemoryAllocation.h +++ b/include/nbl/video/IDeviceMemoryAllocation.h @@ -24,6 +24,8 @@ We only support persistently mapped buffers with ARB_buffer_storage. Please don't ask us to support Buffer Orphaning. */ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { + friend class IDeviceMemoryAllocator; + friend class ILogicalDevice; public: //! 
Access flags for how the application plans to use mapped memory (if any) /** When you create the memory you can allow for it to be mapped (be given a pointer) @@ -68,6 +70,20 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted EMHF_MULTI_INSTANCE_BIT = 0x00000002, }; + //! Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D11_TEXTURE = 0x00000008, + EHT_D3D11_TEXTURE_KMT = 0x00000010, + EHT_D3D12_HEAP = 0x00000020, + EHT_D3D12_RESOURCE = 0x00000040, + EHT_HOST_MAPPED_FOREIGN_MEMORY = 0x00000100, + }; + // const ILogicalDevice* getOriginDevice() const {return m_originDevice;} @@ -75,26 +91,26 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted E_API_TYPE getAPIType() const; //! Whether the allocation was made for a specific resource and is supposed to only be bound to that resource. - inline bool isDedicated() const {return m_dedicated;} + inline bool isDedicated() const {return m_params.dedicated;} //! Returns the size of the memory allocation - inline size_t getAllocationSize() const {return m_allocationSize;} + inline size_t getAllocationSize() const {return m_params.allocationSize;} //! - inline core::bitflag getAllocateFlags() const { return m_allocateFlags; } + inline core::bitflag getAllocateFlags() const { return m_params.allocateFlags; } //! - inline core::bitflag getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } + inline core::bitflag getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } //! 
Utility function, tells whether the allocation can be mapped (whether mapMemory will ever return anything other than nullptr) - inline bool isMappable() const {return m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)||m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} + inline bool isMappable() const {return m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)|| m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT);} //! Utility function, tell us if writes by the CPU or GPU need extra visibility operations to become visible for reading on the other processor /** Only execute flushes or invalidations if the allocation requires them, and batch them (flush one combined range instead of two or more) for greater efficiency. To execute a flush or invalidation, use IDriver::flushMappedAllocationRanges and IDriver::invalidateMappedAllocationRanges respectively. */ // TODO: Visible is a misnomer, collides with Vulkan memory model nomenclature where visibility only concerns reads, where as this is both read and write (visibility and availability) inline bool haveToMakeVisible() const { - return !m_memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); + return !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_COHERENT_BIT); } //! 
@@ -110,9 +126,9 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted { if (isCurrentlyMapped()) return nullptr; - if(accessHint.hasFlags(EMCAF_READ) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) + if(accessHint.hasFlags(EMCAF_READ) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_READABLE_BIT)) return nullptr; - if(accessHint.hasFlags(EMCAF_WRITE) && !m_memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) + if(accessHint.hasFlags(EMCAF_WRITE) && !m_params.memoryPropertyFlags.hasFlags(EMPF_HOST_WRITABLE_BIT)) return nullptr; m_mappedPtr = reinterpret_cast(map_impl(range,accessHint)); if (m_mappedPtr) @@ -153,29 +169,50 @@ class NBL_API2 IDeviceMemoryAllocation : public virtual core::IReferenceCounted //! Constant variant of getMappedPointer inline const void* getMappedPointer() const { return m_mappedPtr; } + struct SInfo + { + uint64_t allocationSize = 0; + core::bitflag allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; + // Handle Type for external resources + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; + //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE + //! 
Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE + external_handle_t externalHandle = 0; + }; + + struct SCreationParams: SInfo + { + core::bitflag memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; + const bool dedicated = false; + }; + + inline const SCreationParams& getCreationParams() const { return m_params; } + + virtual external_handle_t getExternalHandle() const = 0; + protected: - inline IDeviceMemoryAllocation( - const ILogicalDevice* const originDevice, const size_t _size, const core::bitflag allocateFlags, const core::bitflag memoryPropertyFlags, const bool dedicated - ) : m_originDevice(originDevice), m_allocationSize(_size), m_allocateFlags(allocateFlags), m_memoryPropertyFlags(memoryPropertyFlags), m_dedicated(dedicated) {} + + IDeviceMemoryAllocation( + const ILogicalDevice* originDevice, SCreationParams&& params = {}) + : m_originDevice(originDevice) + , m_params(std::move(params)) + , m_mappedPtr(nullptr) + , m_mappedRange{ 0, 0 } + , m_currentMappingAccess(EMCAF_NO_MAPPING_ACCESS) + {} virtual void* map_impl(const MemoryRange& range, const core::bitflag accessHint) = 0; virtual bool unmap_impl() = 0; - - const ILogicalDevice* const m_originDevice; - const size_t m_allocationSize; + const ILogicalDevice* m_originDevice = nullptr; + SCreationParams m_params = {}; uint8_t* m_mappedPtr = nullptr; MemoryRange m_mappedRange = {}; core::bitflag m_currentMappingAccess = EMCAF_NO_MAPPING_ACCESS; - const core::bitflag m_allocateFlags; - const core::bitflag m_memoryPropertyFlags; - const bool m_dedicated; }; NBL_ENUM_ADD_BITWISE_OPERATORS(IDeviceMemoryAllocation::E_MEMORY_PROPERTY_FLAGS) } // end namespace nbl::video -#endif - - +#endif \ No newline at end of file diff --git a/include/nbl/video/IDeviceMemoryAllocator.h b/include/nbl/video/IDeviceMemoryAllocator.h index e85eec12a0..019fbd9358 100644 --- a/include/nbl/video/IDeviceMemoryAllocator.h +++ b/include/nbl/video/IDeviceMemoryAllocator.h @@ -15,11 +15,9 
@@ class NBL_API2 IDeviceMemoryAllocator // right now we only support this interface handing out memory for one device or group virtual ILogicalDevice* getDeviceForAllocations() const = 0; - struct SAllocateInfo + struct SAllocateInfo : IDeviceMemoryAllocation::SInfo { - size_t size : 54 = 0ull; - size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS - size_t memoryTypeIndex : 5 = 0u; + size_t memoryTypeIndex = 0u; IDeviceMemoryBacked* dedication = nullptr; // if you make the info have a `dedication` the memory will be bound right away, also it will use VK_KHR_dedicated_allocation on vulkan // size_t opaqueCaptureAddress = 0u; Note that this mechanism is intended only to support capture/replay tools, and is not recommended for use in other applications. }; @@ -45,8 +43,15 @@ class NBL_API2 IDeviceMemoryAllocator class IMemoryTypeIterator { public: - IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) - : m_allocateFlags(static_cast(allocateFlags.value)), m_reqs(reqs) {} + IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + external_handle_t handle) : + m_allocateFlags(static_cast(allocateFlags.value)), + m_reqs(reqs), + m_handleType(handleType), + m_handle(handle) + {} static inline uint32_t end() {return 32u;} @@ -59,10 +64,12 @@ class NBL_API2 IDeviceMemoryAllocator inline SAllocateInfo operator()(IDeviceMemoryBacked* dedication) { SAllocateInfo ret; - ret.size = m_reqs.size; - ret.flags = m_allocateFlags; + ret.allocationSize = m_reqs.size; + ret.allocateFlags = core::bitflag(m_allocateFlags); ret.memoryTypeIndex = dereference(); ret.dedication = dedication; + ret.externalHandleType = m_handleType; + ret.externalHandle = m_handle; return ret; } @@ -75,13 +82,21 @@ class NBL_API2 IDeviceMemoryAllocator IDeviceMemoryBacked::SDeviceMemoryRequirements m_reqs; uint32_t 
m_allocateFlags; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE m_handleType; + external_handle_t m_handle; }; //! DefaultMemoryTypeIterator will iterate through set bits of memoryTypeBits from LSB to MSB class DefaultMemoryTypeIterator : public IMemoryTypeIterator { public: - DefaultMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag allocateFlags) : IMemoryTypeIterator(reqs, allocateFlags) + DefaultMemoryTypeIterator( + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + core::bitflag allocateFlags, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, + external_handle_t handle + ) : + IMemoryTypeIterator(reqs, allocateFlags, handleType, handle) { currentIndex = hlsl::findLSB(m_reqs.memoryTypeBits); } @@ -106,10 +121,13 @@ class NBL_API2 IDeviceMemoryAllocator template inline SAllocation allocate( - const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, IDeviceMemoryBacked* dedication=nullptr, - const core::bitflag allocateFlags=IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE) + const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, + IDeviceMemoryBacked* dedication = nullptr, + const core::bitflag allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, + external_handle_t externalHandle = {}) { - for(memory_type_iterator_t memTypeIt(reqs, allocateFlags); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) + for (memory_type_iterator_t memTypeIt(reqs, allocateFlags, externalHandleType, externalHandle); memTypeIt!=IMemoryTypeIterator::end(); ++memTypeIt) { SAllocateInfo allocateInfo = memTypeIt.operator()(dedication); auto allocation = allocate(allocateInfo); diff --git a/include/nbl/video/IDeviceMemoryBacked.h b/include/nbl/video/IDeviceMemoryBacked.h index b0c0ce05ed..04693456d7 100644 --- a/include/nbl/video/IDeviceMemoryBacked.h +++ 
b/include/nbl/video/IDeviceMemoryBacked.h @@ -39,6 +39,8 @@ class IDeviceMemoryBacked : public IBackendObject // Thus the destructor will skip the call to `vkDestroy` or `glDelete` on the handle, this is only useful for "imported" objects bool skipHandleDestroy = false; + core::bitflag externalHandleTypes = IDeviceMemoryAllocation::EHT_NONE; + //! If you specify multiple queue family indices, then you're concurrent sharing inline bool isConcurrentSharing() const { diff --git a/include/nbl/video/ILogicalDevice.h b/include/nbl/video/ILogicalDevice.h index 742cb506c6..3ef6d12b64 100644 --- a/include/nbl/video/ILogicalDevice.h +++ b/include/nbl/video/ILogicalDevice.h @@ -162,7 +162,7 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe IQueue::RESULT waitIdle(); //! Semaphore Stuff - virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) = 0; + virtual core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) = 0; // Waits for max timeout amout of time for the semaphores to reach a specific counter value // DOES NOT implicitly trigger Queue-refcount-resource release because of two reasons: // - the events may trigger loads of resource releases causing extra processing, whereas our `timeout` could be quite small @@ -331,39 +331,11 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe //! 
Descriptor Creation // Buffer (@see ICPUBuffer) - inline core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams) - { - const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; - if (creationParams.size>maxSize) - { - m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit (%u)!",system::ILogger::ELL_ERROR,creationParams.size,this,maxSize); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Buffer, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - return createBuffer_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createBuffer(IGPUBuffer::SCreationParams&& creationParams); // Create a BufferView, to a shader; a fake 1D-like texture with no interpolation (@see ICPUBufferView) core::smart_refctd_ptr createBufferView(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt); // Creates an Image (@see ICPUImage) - inline core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams) - { - if (!IGPUImage::validateCreationParameters(creationParams)) - { - m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); - return nullptr; - } - if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) - { - m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); - return nullptr; - } - // TODO: validation of creationParams against the device's limits (sample counts, etc.) 
see vkCreateImage docs - return createImage_impl(std::move(creationParams)); - } + core::smart_refctd_ptr createImage(IGPUImage::SCreationParams&& creationParams); // Create an ImageView that can actually be used by shaders (@see ICPUImageView) inline core::smart_refctd_ptr createImageView(IGPUImageView::SCreationParams&& params) { @@ -1132,9 +1104,9 @@ class NBL_API2 ILogicalDevice : public core::IReferenceCounted, public IDeviceMe virtual bool bindBufferMemory_impl(const uint32_t count, const SBindBufferMemoryInfo* pInfos) = 0; virtual bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) = 0; - virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) = 0; + virtual core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) = 0; - virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) = 0; + virtual core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly = false) = 0; virtual core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) = 0; virtual core::smart_refctd_ptr createTopLevelAccelerationStructure_impl(IGPUTopLevelAccelerationStructure::SCreationParams&& params) = 0; diff --git a/include/nbl/video/IPhysicalDevice.h b/include/nbl/video/IPhysicalDevice.h index 4222a22153..e3cfe15a90 100644 --- a/include/nbl/video/IPhysicalDevice.h +++ b/include/nbl/video/IPhysicalDevice.h @@ -639,6 +639,58 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable return std::span(m_initData.qfamProperties->data(),m_initData.qfamProperties->data()+m_initData.qfamProperties->size()); 
} + enum E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t + { + EEMF_NONE = 0x0, + EEMF_DEDICATED_ONLY_BIT = 0x1, + EEMF_EXPORTABLE_BIT = 0x2, + EEMF_IMPORTABLE_BIT = 0x4, + }; + + struct SExternalMemoryProperties + { + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes : 7; + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes : 7; + // TODO(kevin): This should actually be core::bitflag to be semantically correct. What should we do? Should we use bool for each flag instead of enum? + E_EXTERNAL_MEMORY_FEATURE_FLAGS features : 3; + bool operator == (SExternalMemoryProperties const& rhs) const = default; + }; + static_assert(sizeof(SExternalMemoryProperties) == sizeof(uint32_t)); + + SExternalMemoryProperties getExternalBufferProperties( + core::bitflag usages, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + usages &= ~asset::IBuffer::EUF_SYNTHETIC_FLAGS_MASK; // mask out synthetic flags + + // TODO(kevinyu): Should we cached the properties like Atil does. If yes, needs mutex and mutable specifier. Class become not that simple anymore. 
+ // { + // std::shared_lock lock(m_externalBufferPropertiesMutex); + // auto it = m_externalBufferProperties.find({ usage, handleType }); + // if (it != m_externalBufferProperties.end()) + // return it->second; + // } + // + // std::unique_lock lock(m_externalBufferPropertiesMutex); + // return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); + return getExternalMemoryProperties_impl(usages, handleType); + } + + struct SImageFormatInfo + { + asset::E_FORMAT format; + IGPUImage::E_TYPE type; + IGPUImage::TILING tiling; + core::bitflag usage; + core::bitflag flags; + }; + SExternalMemoryProperties getExternalImageProperties( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const + { + return getExternalMemoryProperties_impl(info, handleType); + } + struct SBufferFormatPromotionRequest { asset::E_FORMAT originalFormat = asset::EF_UNKNOWN; SFormatBufferUsages::SUsage usages = SFormatBufferUsages::SUsage(); @@ -683,6 +735,10 @@ class NBL_API2 IPhysicalDevice : public core::Interface, public core::Unmovable }; inline IPhysicalDevice(SInitData&& _initData) : m_initData(std::move(_initData)) {} + // External memory properties query + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + virtual SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const = 0; + // ILogicalDevice creation bool validateLogicalDeviceCreation(const ILogicalDevice::SCreationParams& params) const; virtual core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) = 0; diff --git a/include/nbl/video/ISemaphore.h b/include/nbl/video/ISemaphore.h index d4fbdd1756..0edc906b5d 100644 --- a/include/nbl/video/ISemaphore.h +++ b/include/nbl/video/ISemaphore.h @@ 
-15,6 +15,27 @@ namespace nbl::video class ISemaphore : public IBackendObject { public: + + //! Flags for imported/exported allocation + enum E_EXTERNAL_HANDLE_TYPE : uint32_t + { + EHT_NONE = 0x00000000, + EHT_OPAQUE_FD = 0x00000001, + EHT_OPAQUE_WIN32 = 0x00000002, + EHT_OPAQUE_WIN32_KMT = 0x00000004, + EHT_D3D12_FENCE = 0x00000008, + EHT_SYNC_FD = 0x00000010, + }; + + //! + struct SCachedCreationParams + { + // Handle Type for external resources + core::bitflag externalHandleTypes = EHT_NONE; + }; + + struct SCreationParams : SCachedCreationParams {}; + // basically a pool function virtual uint64_t getCounterValue() const = 0; @@ -146,9 +167,18 @@ class ISemaphore : public IBackendObject // Vulkan: const VkSemaphore* virtual const void* getNativeHandle() const = 0; + virtual external_handle_t getExternalHandle() const = 0; + + const SCachedCreationParams& getCreationParams() const { return m_creationParams; } + + + protected: - inline ISemaphore(core::smart_refctd_ptr&& dev) : IBackendObject(std::move(dev)) {} + inline ISemaphore(core::smart_refctd_ptr&& dev, SCreationParams&& creationParams) : + IBackendObject(std::move(dev)), m_creationParams(std::move(creationParams)) {} virtual ~ISemaphore() = default; + + SCachedCreationParams m_creationParams; }; } diff --git a/src/nbl/CMakeLists.txt b/src/nbl/CMakeLists.txt index 9c994bfa41..692efec8bd 100644 --- a/src/nbl/CMakeLists.txt +++ b/src/nbl/CMakeLists.txt @@ -294,6 +294,9 @@ set(NBL_VIDEO_SOURCES # CUDA video/CCUDAHandler.cpp video/CCUDADevice.cpp + video/CCUDAImportedSemaphore.cpp + video/CCUDAExportableMemory.cpp + video/CCUDAImportedMemory.cpp ) set(NBL_SCENE_SOURCES @@ -422,6 +425,10 @@ if(NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) target_compile_definitions(Nabla PUBLIC NBL_CPACK_NO_BUILD_DIRECTORY_MODULES) endif() +if(NBL_COMPILE_WITH_CUDA) + target_compile_definitions(Nabla PUBLIC _NBL_COMPILE_WITH_CUDA_) +endif() + set(INTERFACE_BUILD_DEFINITIONS _DXC_DLL_="${DXC_DLL}" ) diff --git 
a/src/nbl/video/CCUDADevice.cpp b/src/nbl/video/CCUDADevice.cpp index 4d2e880095..27f8f6f906 100644 --- a/src/nbl/video/CCUDADevice.cpp +++ b/src/nbl/video/CCUDADevice.cpp @@ -3,128 +3,219 @@ // For conditions of distribution and use, see copyright notice in nabla.h #include "nbl/video/CCUDADevice.h" +#ifdef _WIN32 +#include +#endif + +#include "nbl/video/CCUDAImportedMemory.h" + #ifdef _NBL_COMPILE_WITH_CUDA_ namespace nbl::video { -CCUDADevice::CCUDADevice(core::smart_refctd_ptr&& _vulkanConnection, IPhysicalDevice* const _vulkanDevice, const E_VIRTUAL_ARCHITECTURE _virtualArchitecture) - : m_defaultCompileOptions(), m_vulkanConnection(std::move(_vulkanConnection)), m_vulkanDevice(_vulkanDevice), m_virtualArchitecture(_virtualArchitecture) +CCUDADevice::CCUDADevice( + core::smart_refctd_ptr&& vulkanConnection, + IPhysicalDevice* const vulkanDevice, + const E_VIRTUAL_ARCHITECTURE virtualArchitecture, + CUdevice device, + core::smart_refctd_ptr&& handler) : + m_logger(vulkanDevice->getDebugCallback()->getLogger()), + m_defaultCompileOptions(), + m_vulkanConnection(std::move(vulkanConnection)), + m_physicalDevice(vulkanDevice), + m_virtualArchitecture(virtualArchitecture), + m_handle(device), + m_handler(std::move(handler)), + m_allocationGranularity{} { m_defaultCompileOptions.push_back("--std=c++14"); m_defaultCompileOptions.push_back(virtualArchCompileOption[m_virtualArchitecture]); m_defaultCompileOptions.push_back("-dc"); m_defaultCompileOptions.push_back("-use_fast_math"); -} + const auto& cu = m_handler->getCUDAFunctionTable(); + + ASSERT_CUDA_SUCCESS(cu.pcuCtxCreate_v4(&m_context, nullptr, 0, m_handle), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuCtxSetCurrent(m_context), m_handler); -#if 0 -CUresult CCUDAHandler::registerBuffer(GraphicsAPIObjLink* link, uint32_t flags) + for (uint32_t locationType = 0; locationType < m_allocationGranularity.size(); ++locationType) + { + + #ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; 
+ #endif + + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = static_cast(locationType), .id = m_handle }, + #ifdef _WIN32 + .win32HandleMetaData = &metadata, + #endif + }; + ASSERT_CUDA_SUCCESS(cu.pcuMemGetAllocationGranularity(&m_allocationGranularity[locationType], &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM), m_handler); + } +} + +size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const { - assert(link->obj); - auto glbuf = static_cast(link->obj.get()); - auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); - if (retval!=CUDA_SUCCESS) - link->obj = nullptr; - return retval; + return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; } -CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink* link, uint32_t flags) + +CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const { - assert(link->obj); - - auto format = link->obj->getCreationParameters().format; - if (asset::isBlockCompressionFormat(format) || asset::isDepthOrStencilFormat(format) || asset::isScaledFormat(format) || asset::isPlanarFormat(format)) - return CUDA_ERROR_INVALID_IMAGE; - - auto glimg = static_cast(link->obj.get()); - GLenum target = glimg->getOpenGLTarget(); - switch (target) + const auto& cu = m_handler->getCUDAFunctionTable(); + + CUdeviceptr ptr = 0; + if (const auto err = cu.pcuMemAddressReserve(&ptr, size, alignment, 0, 0); CUDA_SUCCESS != err) + return err; + + if (const auto err = cu.pcuMemMap(ptr, size, 0, memory, 0); CUDA_SUCCESS != err) { - case GL_TEXTURE_2D: - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP: - case GL_TEXTURE_3D: - break; - default: - return CUDA_ERROR_INVALID_IMAGE; - break; + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), 
m_handler); + return err; } - auto retval = cuda.pcuGraphicsGLRegisterImage(&link->cudaHandle,glimg->getOpenGLName(),target,flags); - if (retval != CUDA_SUCCESS) - link->obj = nullptr; - return retval; -} + + CUmemAccessDesc accessDesc = { + .location = { .type = location, .id = m_handle }, + .flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE, + }; + if (auto err = cu.pcuMemSetAccess(ptr, size, &accessDesc, 1); CUDA_SUCCESS != err) + { + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(ptr, size), m_handler); + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(ptr, size), m_handler); + return err; + } + + *outPtr = ptr; -constexpr auto MaxAquireOps = 4096u; + return CUDA_SUCCESS; +} -CUresult CCUDAHandler::acquireAndGetPointers(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream, size_t* outbufferSizes) +core::smart_refctd_ptr CCUDADevice::createExportableMemory(CCUDAExportableMemory::SCreationParams&& inParams) { - if (linksBegin+MaxAquireOpsgetCUDAFunctionTable(); + +#ifdef _WIN32 + OBJECT_ATTRIBUTES metadata = { + .Length = sizeof(OBJECT_ATTRIBUTES) + }; +#endif - CUresult result = acquireResourcesFromGraphics(stackScratch,linksBegin,linksEnd,stream); - if (result != CUDA_SUCCESS) - return result; + const auto prop = CUmemAllocationProp{ + .type = CU_MEM_ALLOCATION_TYPE_PINNED, + .requestedHandleTypes = ALLOCATION_HANDLE_TYPE, + .location = { .type = params.location, .id = m_handle }, +#ifdef _WIN32 + .win32HandleMetaData = &metadata, +#endif + }; + + params.granularSize = roundToGranularity(params.location, params.size); + + CUmemGenericAllocationHandle mem; + if(auto err = cu.pcuMemCreate(&mem, params.granularSize, &prop, 0); CUDA_SUCCESS != err) + { + m_logger.log("Fail to create memory handle!", system::ILogger::ELL_ERROR); + return nullptr; + } + + if (auto err = cu.pcuMemExportToShareableHandle(¶ms.externalHandle, mem, prop.requestedHandleTypes, 0); CUDA_SUCCESS != err) + { + m_logger.log("Fail to create externalHandle!", system::ILogger::ELL_ERROR); + 
ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); + return nullptr; + } - size_t tmp = 0xdeadbeefbadc0ffeull; - size_t* sit = outbufferSizes; - for (auto iit=linksBegin; iit!=linksEnd; iit++,sit++) + if (const auto err = reserveAddressAndMapMemory(¶ms.ptr, params.granularSize, params.alignment, params.location, mem); CUDA_SUCCESS != err) { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; + m_logger.log("Fail to reserve address and map memory!", system::ILogger::ELL_ERROR); + + ASSERT_CUDA_SUCCESS(cu.pcuMemRelease(mem), m_handler); - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedPointer_v2(&iit->asBuffer.pointer,outbufferSizes ? sit:&tmp,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; + bool closeSucceed = CloseExternalHandle(params.externalHandle); + assert(closeSucceed); + + return nullptr; } - return CUDA_SUCCESS; + + if (const auto err = cu.pcuMemRelease(mem); CUDA_SUCCESS != err) + { + bool closeSucceed = CloseExternalHandle(params.externalHandle); + assert(closeSucceed); + return nullptr; + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(params)); } -CUresult CCUDAHandler::acquireAndGetMipmappedArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, CUstream stream) + +core::smart_refctd_ptr CCUDADevice::importExternalMemory(core::smart_refctd_ptr&& mem) { - if (linksBegin+MaxAquireOpsgetCUDAFunctionTable(); + const auto handleType = mem->getCreationParams().externalHandleType; - CUresult result = acquireResourcesFromGraphics(stackScratch,linksBegin,linksEnd,stream); - if (result != CUDA_SUCCESS) - return result; + if (!handleType) return nullptr; - for (auto iit=linksBegin; iit!=linksEnd; iit++) - { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; + const auto externalHandle = mem->getExternalHandle(); + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; +#ifdef _WIN32 + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32; + extMemDesc.handle.win32.handle = 
externalHandle; +#else + extMemDesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD; + extMemDesc.handle.fd = externalHandle; +#endif + extMemDesc.size = mem->getAllocationSize(); - result = cuda::CCUDAHandler::cuda.pcuGraphicsResourceGetMappedMipmappedArray(&iit->asImage.mipmappedArray,iit->cudaHandle); - if (result != CUDA_SUCCESS) - return result; + CUexternalMemory cuExtMem; + if (const auto err = cu.pcuImportExternalMemory(&cuExtMem, &extMemDesc); CUDA_SUCCESS != err) + { + m_logger.log("Fail to import external memory into CUDA!", system::ILogger::ELL_ERROR); + return nullptr; } - return CUDA_SUCCESS; + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(mem), cuExtMem); } -CUresult CCUDAHandler::acquireAndGetArray(GraphicsAPIObjLink* linksBegin, GraphicsAPIObjLink* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream) + +core::smart_refctd_ptr CCUDADevice::importExternalSemaphore(core::smart_refctd_ptr&& sema) { - if (linksBegin+MaxAquireOpsgetCUDAFunctionTable(); + auto handleType = sema->getCreationParams().externalHandleTypes.value; + + if (!handleType) + return nullptr; + + CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC desc = { +#ifdef _WIN32 + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, + // TODO(kevinyu): Fix this later. Make it compile first. 
+ .handle = {.win32 = {.handle = sema->getExternalHandle() }}, +#else + .type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + .handle = {.fd = sema->getExternalHandle()} +#endif + }; - CUresult result = acquireResourcesFromGraphics(stackScratch,linksBegin,linksEnd,stream); - if (result != CUDA_SUCCESS) - return result; - auto ait = arrayIndices; - auto mit = mipLevels; - for (auto iit=linksBegin; iit!=linksEnd; iit++,ait++,mit++) + CUexternalSemaphore cusema; + if (const auto err = cu.pcuImportExternalSemaphore(&cusema, &desc); CUDA_SUCCESS != err) { - if (!iit->acquired) - return CUDA_ERROR_UNKNOWN; - - result = cuda::CCUDAHandler::cuda.pcuGraphicsSubResourceGetMappedArray(&iit->asImage.array,iit->cudaHandle,*ait,*mit); - if (result != CUDA_SUCCESS) - return result; + m_logger.log("Fail to import semaphore into CUDA!"); + return nullptr; } - return CUDA_SUCCESS; + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(sema), cusema); +} + +CCUDADevice::~CCUDADevice() +{ + ASSERT_CUDA_SUCCESS(m_handler->getCUDAFunctionTable().pcuCtxDestroy_v2(m_context), m_handler); } -#endif } diff --git a/src/nbl/video/CCUDAExportableMemory.cpp b/src/nbl/video/CCUDAExportableMemory.cpp new file mode 100644 index 0000000000..66cbbdcf4f --- /dev/null +++ b/src/nbl/video/CCUDAExportableMemory.cpp @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAExportableMemory.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ + +core::smart_refctd_ptr CCUDAExportableMemory::exportAsMemory(ILogicalDevice* device, IDeviceMemoryBacked* dedication) const +{ + auto pd = device->getPhysicalDevice(); + uint32_t memoryTypeBits = (1 << pd->getMemoryProperties().memoryTypeCount) - 1; + uint32_t vram = pd->getDeviceLocalMemoryTypeBits(); + + switch (m_params.location) + { + case CU_MEM_LOCATION_TYPE_DEVICE: memoryTypeBits &= vram; break; + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + case CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT: + case CU_MEM_LOCATION_TYPE_HOST: memoryTypeBits &= ~vram; break; + default: break; + } + + IDeviceMemoryBacked::SDeviceMemoryRequirements req = {}; + req.size = m_params.granularSize; + req.memoryTypeBits = memoryTypeBits; + req.prefersDedicatedAllocation = nullptr != dedication; + req.requiresDedicatedAllocation = nullptr != dedication; + + return device->allocate(req, + dedication, + IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, + CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, + m_params.externalHandle).memory; +} + +CCUDAExportableMemory::~CCUDAExportableMemory() +{ + const auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + + ASSERT_CUDA_SUCCESS(cu.pcuMemUnmap(m_params.ptr, m_params.granularSize), m_device->getHandler()); + + ASSERT_CUDA_SUCCESS(cu.pcuMemAddressFree(m_params.ptr, m_params.granularSize), m_device->getHandler()); + + bool closeSucceed = CloseExternalHandle(m_params.externalHandle); + assert(closeSucceed); + +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CCUDAHandler.cpp b/src/nbl/video/CCUDAHandler.cpp index 7fb60d79bf..060afe6631 100644 --- a/src/nbl/video/CCUDAHandler.cpp +++ b/src/nbl/video/CCUDAHandler.cpp @@ -3,6 +3,7 @@ // For conditions of distribution and use, see 
copyright notice in nabla.h #include "nbl/video/CCUDAHandler.h" +#include "nbl/system/CFileView.h" #ifdef _NBL_COMPILE_WITH_CUDA_ #include "jitify/jitify.hpp" @@ -11,6 +12,48 @@ namespace nbl::video { +CCUDAHandler::CCUDAHandler( + CUDA&& _cuda, + NVRTC&& _nvrtc, + core::vector>&& _headers, + core::smart_refctd_ptr&& _logger, + int _version) + : m_cuda(std::move(_cuda)) + , m_nvrtc(std::move(_nvrtc)) + , m_headers(std::move(_headers)) + , m_logger(std::move(_logger)) + , m_version(_version) +{ + for (auto& header : m_headers) + { + m_headerContents.push_back(reinterpret_cast(header->getMappedPointer())); + m_headerNamesStorage.push_back(header->getFileName().string()); + m_headerNames.push_back(m_headerNamesStorage.back().c_str()); + } + + int deviceCount = 0; + if (m_cuda.pcuDeviceGetCount(&deviceCount) != CUDA_SUCCESS || deviceCount <= 0) + return; + + for (int device_i = 0; device_i < deviceCount; device_i++) + { + CUdevice handle = -1; + if (m_cuda.pcuDeviceGet(&handle, device_i) != CUDA_SUCCESS || handle < 0) + continue; + + CUuuid uuid = {}; + if (m_cuda.pcuDeviceGetUuid_v2(&uuid, handle) != CUDA_SUCCESS) + continue; + + m_availableDevices.emplace_back(handle, uuid); + + int* attributes = m_availableDevices.back().attributes; + for (int i = 0; i < CU_DEVICE_ATTRIBUTE_MAX; i++) + m_cuda.pcuDeviceGetAttribute(attributes + i, static_cast(i), handle); + + } +} + bool CCUDAHandler::defaultHandleResult(CUresult result, const system::logger_opt_ptr& logger) { switch (result) @@ -410,7 +453,14 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste NVRTC nvrtc = {}; #if defined(_NBL_WINDOWS_API_) // Perpetual TODO: any new CUDA releases we need to account for? 
- const char* nvrtc64_versions[] = { "nvrtc64_111","nvrtc64_110","nvrtc64_102","nvrtc64_101","nvrtc64_100","nvrtc64_92","nvrtc64_91","nvrtc64_90","nvrtc64_80","nvrtc64_75","nvrtc64_70",nullptr }; + // Version List: https://developer.nvidia.com/cuda-toolkit-archive + const char* nvrtc64_versions[] = { + "nvrtc64_132", + "nvrtc64_131", + "nvrtc64_130", + nullptr + }; + const char* nvrtc64_suffices[] = {"","_","_0","_1","_2",nullptr}; for (auto verpath=nvrtc64_versions; *verpath; verpath++) { @@ -447,7 +497,7 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste int cudaVersion = 0; SAFE_CUDA_CALL(cuDriverGetVersion,&cudaVersion) - if (cudaVersion<9000) + if (cudaVersion<13000) return nullptr; // stop the pollution @@ -468,15 +518,15 @@ core::smart_refctd_ptr CCUDAHandler::create(system::ISystem* syste { const void* contents = it.second.data(); headers.push_back(core::make_smart_refctd_ptr>( - core::smart_refctd_ptr(system),it.first.c_str(), + it.first.c_str(), core::bitflag(system::IFile::ECF_READ)|system::IFile::ECF_MAPPABLE, + // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? 
+ std::chrono::clock_cast(std::chrono::system_clock::now()), const_cast(contents),it.second.size()+1u )); } - - CCUDAHandler* handler = new CCUDAHandler(std::move(cuda), std::move(nvrtc),std::move(headers), std::move(_logger), cudaVersion); - return core::smart_refctd_ptr(handler,core::dont_grab); + return core::make_smart_refctd_ptr(std::move(cuda),std::move(nvrtc), std::move(headers), std::move(_logger), cudaVersion); } nvrtcResult CCUDAHandler::createProgram(nvrtcProgram* prog, std::string&& source, const char* name, const int headerCount, const char* const* headerContents, const char* const* includeNames) @@ -513,8 +563,11 @@ CCUDAHandler::ptx_and_nvrtcResult_t CCUDAHandler::getPTX(nvrtcProgram prog) if (_size==0ull) return {nullptr,NVRTC_ERROR_INVALID_INPUT}; - auto ptx = asset::ICPUBuffer::create({ _size }); - return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,reinterpret_cast(ptx->getPointer()))}; + asset::ICPUBuffer::SCreationParams ptxParams = {}; + ptxParams.size = _size; + auto ptx = asset::ICPUBuffer::create(std::move(ptxParams)); + auto ptxPtr = static_cast(ptx->getPointer()); + return {std::move(ptx),m_nvrtc.pnvrtcGetPTX(prog,ptxPtr)}; } core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refctd_ptr&& vulkanConnection, IPhysicalDevice* physicalDevice) @@ -525,28 +578,13 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (std::find(devices.begin(),devices.end(),physicalDevice)==devices.end()) return nullptr; - int deviceCount = 0; - if (m_cuda.pcuDeviceGetCount(&deviceCount)!=CUDA_SUCCESS || deviceCount<=0) - return nullptr; - - for (int ordinal=0; ordinalgetLimits().deviceUUID,VK_UUID_SIZE)) + if (!memcmp(&device.uuid,&physicalDevice->getProperties().deviceUUID,VK_UUID_SIZE)) { - int attributes[CU_DEVICE_ATTRIBUTE_MAX] = {}; - for (int i=0; i(i),handle); - CCUDADevice::E_VIRTUAL_ARCHITECTURE arch = CCUDADevice::EVA_COUNT; - const int& archMajor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; - const int& 
archMinor = attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; + const int& archMajor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR]; + const int& archMinor = device.attributes[CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR]; switch (archMajor) { case 3: @@ -624,10 +662,9 @@ core::smart_refctd_ptr CCUDAHandler::createDevice(core::smart_refct if (arch==CCUDADevice::EVA_COUNT) continue; - auto device = new CCUDADevice(std::move(vulkanConnection),physicalDevice,arch); - return core::smart_refctd_ptr(device,core::dont_grab); - } - } + return core::make_smart_refctd_ptr(std::move(vulkanConnection), physicalDevice, arch, device.handle, core::smart_refctd_ptr(this)); + } + } return nullptr; } diff --git a/src/nbl/video/CCUDAImportedMemory.cpp b/src/nbl/video/CCUDAImportedMemory.cpp new file mode 100644 index 0000000000..7e21b05ef1 --- /dev/null +++ b/src/nbl/video/CCUDAImportedMemory.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAImportedMemory.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ + +namespace nbl::video +{ + +CUresult CCUDAImportedMemory::getMappedBuffer(CUdeviceptr* mappedBuffer) +{ + CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufferDesc = {}; + bufferDesc.offset = 0; + bufferDesc.size = m_src->getAllocationSize(); + + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); + +} + +CCUDAImportedMemory::~CCUDAImportedMemory() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalMemory(m_handle), m_device->getHandler()); +} + +} + +#endif \ No newline at end of file diff --git a/src/nbl/video/CCUDAImportedSemaphore.cpp b/src/nbl/video/CCUDAImportedSemaphore.cpp new file mode 100644 index 0000000000..0dc750a4a9 --- /dev/null +++ b/src/nbl/video/CCUDAImportedSemaphore.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#include "nbl/video/CCUDAImportedSemaphore.h" +#include "nbl/video/CCUDADevice.h" + +#ifdef _NBL_COMPILE_WITH_CUDA_ +namespace nbl::video +{ +CCUDAImportedSemaphore::~CCUDAImportedSemaphore() +{ + auto& cu = m_device->getHandler()->getCUDAFunctionTable(); + ASSERT_CUDA_SUCCESS(cu.pcuDestroyExternalSemaphore(m_handle), m_device->getHandler()); +} +} + +#endif // _NBL_COMPILE_WITH_CUDA_ \ No newline at end of file diff --git a/src/nbl/video/CVulkanBuffer.h b/src/nbl/video/CVulkanBuffer.h index 4596981c2a..944d7db205 100644 --- a/src/nbl/video/CVulkanBuffer.h +++ b/src/nbl/video/CVulkanBuffer.h @@ -16,7 +16,7 @@ class CVulkanBuffer : public CVulkanDeviceMemoryBacked using base_t = CVulkanDeviceMemoryBacked; public: - inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, const VkBuffer buffer) : base_t(dev,std::move(creationParams),buffer) {} + inline CVulkanBuffer(const CVulkanLogicalDevice* dev, IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly, const VkBuffer buffer) : base_t(dev, std::move(creationParams), dedicatedOnly, buffer) {} void setObjectDebugName(const char* label) const override; diff --git a/src/nbl/video/CVulkanCommandBuffer.cpp b/src/nbl/video/CVulkanCommandBuffer.cpp index a04b5940ce..40b20bb5d2 100644 --- a/src/nbl/video/CVulkanCommandBuffer.cpp +++ b/src/nbl/video/CVulkanCommandBuffer.cpp @@ -90,10 +90,10 @@ void fill(vk_barrier_t& out, const ResourceBarrier& in, uint32_t selfQueueFamily switch (in.ownershipOp) { case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::RELEASE: - out.dstQueueFamilyIndex = in.otherQueueFamilyIndex; + out.dstQueueFamilyIndex = getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; case IGPUCommandBuffer::SOwnershipTransferBarrier::OWNERSHIP_OP::ACQUIRE: - out.srcQueueFamilyIndex = in.otherQueueFamilyIndex; + out.srcQueueFamilyIndex = 
getVkQueueIndexFrom(in.otherQueueFamilyIndex); break; } } diff --git a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp index 90b2993cb3..955885b7ae 100644 --- a/src/nbl/video/CVulkanDeviceMemoryBacked.cpp +++ b/src/nbl/video/CVulkanDeviceMemoryBacked.cpp @@ -6,7 +6,7 @@ namespace nbl::video { template -IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, const VkResource_t vkHandle) +IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked::obtainRequirements(const CVulkanLogicalDevice* device, bool dedicatedOnly, const VkResource_t vkHandle) { const std::conditional_t vk_memoryRequirementsInfo = { IsImage ? VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2:VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,nullptr,vkHandle @@ -24,8 +24,8 @@ IDeviceMemoryBacked::SDeviceMemoryRequirements CVulkanDeviceMemoryBacked CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) +core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams) { + + // TODO(kevin) : Handle importing external semaphore into Vulkan + // VkImportSemaphoreWin32HandleInfoKHR importInfo = { VK_STRUCTURE_TYPE_IMPORT_SEMAPHORE_WIN32_HANDLE_INFO_KHR }; + + VkExportSemaphoreCreateInfo exportInfo = { + VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO, + nullptr, + static_cast(creationParams.externalHandleTypes.value) + }; + VkSemaphoreTypeCreateInfoKHR type = { VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR }; - type.pNext = nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR + type.pNext = creationParams.externalHandleTypes.value ? 
&exportInfo : nullptr; // Each pNext member of any structure (including this one) in the pNext chain must be either NULL or a pointer to a valid instance of VkExportSemaphoreCreateInfo, VkExportSemaphoreWin32HandleInfoKHR, or VkSemaphoreTypeCreateInfo type.semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE_KHR; type.initialValue = initialValue; @@ -67,11 +77,42 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createSemaphore(const u createInfo.flags = static_cast(0); // flags must be 0 VkSemaphore semaphore; - if (m_devf.vk.vkCreateSemaphore(m_vkdev,&createInfo,nullptr,&semaphore)==VK_SUCCESS) - return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this),semaphore); - else + if (!m_devf.vk.vkCreateSemaphore(m_vkdev, &createInfo, nullptr, &semaphore) == VK_SUCCESS) return nullptr; + + external_handle_t externalHandle = external_handle_t{}; + const auto handleType = static_cast(creationParams.externalHandleTypes.value); + if (handleType != 0) + { +#ifdef _WIN32 + VkSemaphoreGetWin32HandleInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, + .semaphore = semaphore, + .handleType = handleType, + }; + + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); + return nullptr; + } +#else + VkSemaphoreGetFdInfoKHR props = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, + .semaphore = vkSemaphore, + .handleType = handleType, + }; + if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreFdKHR(m_vkdev, &props, &externalHandle)) + { + m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); + return nullptr; + } +#endif + } + + return core::make_smart_refctd_ptr(core::smart_refctd_ptr(this), std::move(creationParams), semaphore, externalHandle); } + ISemaphore::WAIT_RESULT CVulkanLogicalDevice::waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) { using retval_t = ISemaphore::WAIT_RESULT; @@ -136,26 +177,72 @@ 
core::smart_refctd_ptr CVulkanLogicalDevice::createDeferredO return core::smart_refctd_ptr(reinterpret_cast(memory),core::dont_grab); } - IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAllocateInfo& info) { - IDeviceMemoryAllocator::SAllocation ret = {}; if (info.memoryTypeIndex>=m_physicalDevice->getMemoryProperties().memoryTypeCount) - return ret; + return {}; - const core::bitflag allocateFlags(info.flags); VkMemoryAllocateFlagsInfo vk_allocateFlagsInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO, nullptr }; { - if (allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) + if (info.allocateFlags.hasFlags(IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT)) vk_allocateFlagsInfo.flags |= VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT; vk_allocateFlagsInfo.deviceMask = 0u; // unused: for now } VkMemoryDedicatedAllocateInfo vk_dedicatedInfo = {VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO, nullptr}; + +#ifdef _WIN32 + VkImportMemoryWin32HandleInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + }; + + VkExportMemoryWin32HandleInfoKHR handleInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_WIN32_HANDLE_INFO_KHR, + .dwAccess = GENERIC_ALL, + }; +#else + VkImportMemoryFdInfoKHR importInfo = { + .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_FD_INFO_KHR, + .handleType = static_cast(info.externalHandleType), + .fd = info.externalHandle, + }; +#endif + + VkExportMemoryAllocateInfo exportInfo = { + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, +#ifdef _WIN32 + .pNext = &handleInfo, +#endif + .handleTypes = static_cast(info.externalHandleType), + }; + + const void** pNext = &vk_allocateFlagsInfo.pNext; + + external_handle_t externalHandle = ExternalHandleNull; + if (info.externalHandleType) + { + if (info.externalHandle) //importing + { + externalHandle = DuplicateExternalHandle(info.externalHandle); +#ifdef _WIN32 + importInfo.handle = 
externalHandle; +#else + importInfo.fd = externalHandle; +#endif + *pNext = &importInfo; + } + else // exporting + *pNext = &exportInfo; + pNext = (const void**)&((VkBaseInStructure*)*pNext)->pNext; + } + if(info.dedication) { // VK_KHR_dedicated_allocation is in core 1.1, no querying for support needed static_assert(MinimumVulkanApiVersion >= VK_MAKE_API_VERSION(0,1,1,0)); - vk_allocateFlagsInfo.pNext = &vk_dedicatedInfo; + *pNext = &vk_dedicatedInfo; + pNext = &vk_dedicatedInfo.pNext; + switch (info.dedication->getObjectType()) { case IDeviceMemoryBacked::EOT_BUFFER: @@ -166,22 +253,65 @@ IDeviceMemoryAllocator::SAllocation CVulkanLogicalDevice::allocate(const SAlloca break; default: assert(false); - return ret; + return {}; break; } } VkMemoryAllocateInfo vk_allocateInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, &vk_allocateFlagsInfo}; - vk_allocateInfo.allocationSize = info.size; + vk_allocateInfo.allocationSize = info.allocationSize; vk_allocateInfo.memoryTypeIndex = info.memoryTypeIndex; VkDeviceMemory vk_deviceMemory; auto vk_res = m_devf.vk.vkAllocateMemory(m_vkdev, &vk_allocateInfo, nullptr, &vk_deviceMemory); if (vk_res!=VK_SUCCESS) - return ret; + return {}; + + const bool exported = info.externalHandleType && !info.externalHandle; + + if (exported) + { +#ifdef _WIN32 + VkMemoryGetWin32HandleInfoKHR +#else + VkMemoryGetFdInfoKHR +#endif + handleInfo = { .sType = +#ifdef _WIN32 + VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, +#else + VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, +#endif + .memory = vk_deviceMemory, + .handleType = static_cast(info.externalHandleType), + }; + + /* + For handle types defined as NT handles, + the handles returned by vkGetMemoryWin32HandleKHR are owned by the application + and hold a reference to their payload. To avoid leaking resources, + the application must release ownership of them + using the CloseHandle system call when they are no longer needed. + */ + + if (VK_SUCCESS != m_devf.vk. 
+#ifdef _WIN32 + vkGetMemoryWin32HandleKHR +#else + vkGetMemoryFdKHR +#endif + (m_vkdev, &handleInfo, &externalHandle)) + { + m_devf.vk.vkFreeMemory(m_vkdev, vk_deviceMemory, 0); + return {}; + } + + } // automatically allocation goes out of scope and frees itself if no success later on const auto memoryPropertyFlags = m_physicalDevice->getMemoryProperties().memoryTypes[info.memoryTypeIndex].propertyFlags; - ret.memory = core::make_smart_refctd_ptr(this,info.size,allocateFlags,memoryPropertyFlags,info.dedication,vk_deviceMemory); + CVulkanMemoryAllocation::SCreationParams params = { info, memoryPropertyFlags, !!info.dedication }; + IDeviceMemoryAllocator::SAllocation ret = {}; + ret.memory = core::make_smart_refctd_ptr(this, vk_deviceMemory, externalHandle, std::move(params)); ret.offset = 0ull; // LogicalDevice doesn't suballocate, so offset is always 0, if you want to suballocate, write/use an allocator if(info.dedication) { @@ -299,11 +429,17 @@ bool CVulkanLogicalDevice::bindImageMemory_impl(const uint32_t count, const SBin } -core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) +core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) { + + VkExternalMemoryBufferCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_BUFFER_CREATE_INFO, + .handleTypes = creationParams.externalHandleTypes.value, + }; + VkBufferCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO }; // VkBufferDeviceAddressCreateInfoEXT, VkExternalMemoryBufferCreateInfo, VkVideoProfileKHR, or VkVideoProfilesKHR - vk_createInfo.pNext = nullptr; + vk_createInfo.pNext = creationParams.externalHandleTypes.value ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(0u); // Nabla doesn't support any of these flags vk_createInfo.size = static_cast(creationParams.size); vk_createInfo.usage = getVkBufferUsageFlagsFromBufferUsageFlags(creationParams.usage); @@ -319,7 +455,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBuffer_impl(IGPUB VkBuffer vk_buffer; if (m_devf.vk.vkCreateBuffer(m_vkdev,&vk_createInfo,nullptr,&vk_buffer)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(creationParams),vk_buffer); + return core::make_smart_refctd_ptr(this, std::move(creationParams), dedicatedOnly, vk_buffer); } core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) @@ -338,7 +474,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createBufferView_im return nullptr; } -core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params) +core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) { const bool hasStencil = asset::isDepthOrStencilFormat(params.format) && !asset::isDepthOnlyFormat(params.format); VkImageStencilUsageCreateInfo vk_stencilUsage = { VK_STRUCTURE_TYPE_IMAGE_STENCIL_USAGE_CREATE_INFO, nullptr }; @@ -354,7 +490,14 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_formatList[vk_formatListStruct.viewFormatCount++] = getVkFormatFromFormat(static_cast(fmt)); vk_formatListStruct.pViewFormats = vk_formatList.data(); + const bool external = params.externalHandleTypes.value; + VkExternalMemoryImageCreateInfo externalMemoryInfo = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO, + .handleTypes = params.externalHandleTypes.value, + }; + VkImageCreateInfo vk_createInfo = { VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, &vk_formatListStruct }; + vk_createInfo.pNext = external ? 
&externalMemoryInfo : nullptr; vk_createInfo.flags = static_cast(params.flags.value); vk_createInfo.imageType = static_cast(params.type); vk_createInfo.format = getVkFormatFromFormat(params.format); @@ -372,12 +515,13 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createImage_impl(IGPUIma vk_createInfo.sharingMode = params.isConcurrentSharing() ? VK_SHARING_MODE_CONCURRENT:VK_SHARING_MODE_EXCLUSIVE; vk_createInfo.queueFamilyIndexCount = params.queueFamilyIndexCount; vk_createInfo.pQueueFamilyIndices = params.queueFamilyIndices; - vk_createInfo.initialLayout = params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED:VK_IMAGE_LAYOUT_UNDEFINED; + // The Vulkan spec states: If the pNext chain includes a VkExternalMemoryImageCreateInfo or VkExternalMemoryImageCreateInfoNV structure whose handleTypes member is not 0, initialLayout must be VK_IMAGE_LAYOUT_UNDEFINED + vk_createInfo.initialLayout = external ? VK_IMAGE_LAYOUT_UNDEFINED : (params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED); VkImage vk_image; if (m_devf.vk.vkCreateImage(m_vkdev,&vk_createInfo,nullptr,&vk_image)!=VK_SUCCESS) return nullptr; - return core::make_smart_refctd_ptr(this,std::move(params),vk_image); + return core::make_smart_refctd_ptr(this, std::move(params), dedicatedOnly, vk_image); } core::smart_refctd_ptr CVulkanLogicalDevice::createImageView_impl(IGPUImageView::SCreationParams&& params) @@ -548,7 +692,7 @@ core::smart_refctd_ptr CVulkanLogicalDevice::createDesc vkDescSetLayoutBinding.stageFlags = getVkShaderStageFlagsFromShaderStage(binding.stageFlags); vkDescSetLayoutBinding.pImmutableSamplers = nullptr; - if ((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER or binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) and binding.immutableSamplers and binding.count) + if ((binding.type == asset::IDescriptor::E_TYPE::ET_SAMPLER || binding.type==asset::IDescriptor::E_TYPE::ET_COMBINED_IMAGE_SAMPLER) && binding.immutableSamplers && binding.count) 
{ // If descriptorType is VK_DESCRIPTOR_TYPE_SAMPLER or VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, and descriptorCount is not 0 and pImmutableSamplers is not NULL: // pImmutableSamplers must be a valid pointer to an array of descriptorCount valid VkSampler handles. diff --git a/src/nbl/video/CVulkanLogicalDevice.h b/src/nbl/video/CVulkanLogicalDevice.h index e77386cb34..09213f28db 100644 --- a/src/nbl/video/CVulkanLogicalDevice.h +++ b/src/nbl/video/CVulkanLogicalDevice.h @@ -53,7 +53,7 @@ class CVulkanLogicalDevice final : public ILogicalDevice CVulkanLogicalDevice(core::smart_refctd_ptr&& api, renderdoc_api_t* const rdoc, const IPhysicalDevice* const physicalDevice, const VkDevice vkdev, const SCreationParams& params); // sync stuff - core::smart_refctd_ptr createSemaphore(const uint64_t initialValue) override; + core::smart_refctd_ptr createSemaphore(const uint64_t initialValue, ISemaphore::SCreationParams&& creationParams = {}) override; ISemaphore::WAIT_RESULT waitForSemaphores(const std::span infos, const bool waitAll, const uint64_t timeout) override; core::smart_refctd_ptr createEvent(const IEvent::CREATE_FLAGS flags) override; @@ -110,9 +110,9 @@ class CVulkanLogicalDevice final : public ILogicalDevice bool bindImageMemory_impl(const uint32_t count, const SBindImageMemoryInfo* pInfos) override; // descriptor creation - core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams) override; + core::smart_refctd_ptr createBuffer_impl(IGPUBuffer::SCreationParams&& creationParams, bool dedicatedOnly) override; core::smart_refctd_ptr createBufferView_impl(const asset::SBufferRange& underlying, const asset::E_FORMAT _fmt) override; - core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params) override; + core::smart_refctd_ptr createImage_impl(IGPUImage::SCreationParams&& params, bool dedicatedOnly) override; core::smart_refctd_ptr createImageView_impl(IGPUImageView::SCreationParams&& params) override; 
VkAccelerationStructureKHR createAccelerationStructure(const IGPUAccelerationStructure::SCreationParams& params, const VkAccelerationStructureTypeKHR type, const VkAccelerationStructureMotionInfoNV* motionInfo=nullptr); inline core::smart_refctd_ptr createBottomLevelAccelerationStructure_impl(IGPUAccelerationStructure::SCreationParams&& params) override diff --git a/src/nbl/video/CVulkanMemoryAllocation.cpp b/src/nbl/video/CVulkanMemoryAllocation.cpp index 5a4dfd5ff5..0ec6fc351d 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.cpp +++ b/src/nbl/video/CVulkanMemoryAllocation.cpp @@ -4,14 +4,19 @@ namespace nbl::video { CVulkanMemoryAllocation::CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle -) : IDeviceMemoryAllocation(dev,size,flags,memoryPropertyFlags,isDedicated), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle) {} + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + const external_handle_t externalHandle, + SCreationParams&& params +) : IDeviceMemoryAllocation(dev,std::move(params)), m_vulkanDevice(dev), m_deviceMemoryHandle(deviceMemoryHandle), m_externalHandle(externalHandle) {} CVulkanMemoryAllocation::~CVulkanMemoryAllocation() { + if (m_externalHandle != ExternalHandleNull) + { + bool re = CloseExternalHandle(m_externalHandle); + assert(re); + } m_vulkanDevice->getFunctionTable()->vk.vkFreeMemory(m_vulkanDevice->getInternalObject(),m_deviceMemoryHandle,nullptr); } diff --git a/src/nbl/video/CVulkanMemoryAllocation.h b/src/nbl/video/CVulkanMemoryAllocation.h index 470e914ae3..473d826595 100644 --- a/src/nbl/video/CVulkanMemoryAllocation.h +++ b/src/nbl/video/CVulkanMemoryAllocation.h @@ -15,14 +15,19 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation { public: CVulkanMemoryAllocation( - const CVulkanLogicalDevice* dev, const size_t 
size, - const core::bitflag flags, - const core::bitflag memoryPropertyFlags, - const bool isDedicated, const VkDeviceMemory deviceMemoryHandle + const CVulkanLogicalDevice* dev, + const VkDeviceMemory deviceMemoryHandle, + const external_handle_t externalHandle, + SCreationParams&& params ); inline VkDeviceMemory getInternalObject() const { return m_deviceMemoryHandle; } + inline external_handle_t getExternalHandle() const override + { + return m_externalHandle; + } + private: ~CVulkanMemoryAllocation(); @@ -31,6 +36,7 @@ class CVulkanMemoryAllocation : public IDeviceMemoryAllocation core::smart_refctd_ptr m_vulkanDevice; const VkDeviceMemory m_deviceMemoryHandle; + const external_handle_t m_externalHandle; }; } diff --git a/src/nbl/video/CVulkanPhysicalDevice.cpp b/src/nbl/video/CVulkanPhysicalDevice.cpp index 65a0c358cc..03647a12f2 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.cpp +++ b/src/nbl/video/CVulkanPhysicalDevice.cpp @@ -1,5 +1,6 @@ #include "nbl/video/CVulkanPhysicalDevice.h" #include "nbl/video/CVulkanLogicalDevice.h" +#include "nbl/video/IGPUImage.h" namespace nbl::video { @@ -1385,6 +1386,63 @@ std::unique_ptr CVulkanPhysicalDevice::create(core::smart #undef RETURN_NULL_PHYSICAL_DEVICE +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + assert(!(handleType & (handleType - 1))); + VkPhysicalDeviceExternalBufferInfo info = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_BUFFER_INFO, + .usage = static_cast(usages.value), + .handleType = static_cast(handleType) + }; + VkExternalBufferProperties externalProps = { VK_STRUCTURE_TYPE_EXTERNAL_BUFFER_PROPERTIES }; + vkGetPhysicalDeviceExternalBufferProperties(m_vkPhysicalDevice, &info, &externalProps); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = 
static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + +IPhysicalDevice::SExternalMemoryProperties CVulkanPhysicalDevice::getExternalMemoryProperties_impl( + const SImageFormatInfo& info, + IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const +{ + VkPhysicalDeviceExternalImageFormatInfo externalImageFormatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_EXTERNAL_IMAGE_FORMAT_INFO, + .handleType = static_cast(handleType), + }; + + VkPhysicalDeviceImageFormatInfo2 formatInfo = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_IMAGE_FORMAT_INFO_2, + .pNext = &externalImageFormatInfo, + .format = getVkFormatFromFormat(info.format), + .type = static_cast(info.type), + .tiling = static_cast(info.tiling), + .usage = getVkImageUsageFlagsFromImageUsageFlags(info.usage.value, asset::isDepthOrStencilFormat(info.format)), + .flags = static_cast(info.flags.value), + }; + + VkExternalImageFormatProperties externalProps = { + .sType = VK_STRUCTURE_TYPE_EXTERNAL_IMAGE_FORMAT_PROPERTIES, + }; + VkImageFormatProperties2 props = { + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_PROPERTIES_2, + .pNext = &externalProps, + }; + + auto re = vkGetPhysicalDeviceImageFormatProperties2(m_vkPhysicalDevice, &formatInfo, &props); + assert(VK_SUCCESS == re); + + const auto& externalMemProps = externalProps.externalMemoryProperties; + return SExternalMemoryProperties{ + .exportableTypes = static_cast(externalMemProps.exportFromImportedHandleTypes), + .compatibleTypes = static_cast(externalMemProps.compatibleHandleTypes), + .features = static_cast(externalMemProps.externalMemoryFeatures) + }; +} + core::smart_refctd_ptr CVulkanPhysicalDevice::createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) { // We might alter it to account for dependancies. 
diff --git a/src/nbl/video/CVulkanPhysicalDevice.h b/src/nbl/video/CVulkanPhysicalDevice.h index c1552c88f1..40e0dd78fe 100644 --- a/src/nbl/video/CVulkanPhysicalDevice.h +++ b/src/nbl/video/CVulkanPhysicalDevice.h @@ -109,6 +109,10 @@ class CVulkanPhysicalDevice final : public IPhysicalDevice // [NOOP] If sparseImageFloat32AtomicMinMax is enabled, shaderImageFloat32AtomicMinMax must be enabled } + SExternalMemoryProperties getExternalMemoryProperties_impl(core::bitflag usages, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + + SExternalMemoryProperties getExternalMemoryProperties_impl(const SImageFormatInfo& imageFormatInfo, IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType) const override; + core::smart_refctd_ptr createLogicalDevice_impl(ILogicalDevice::SCreationParams&& params) override; private: diff --git a/src/nbl/video/CVulkanSemaphore.cpp b/src/nbl/video/CVulkanSemaphore.cpp index 071c4b2843..35aefa6ebd 100644 --- a/src/nbl/video/CVulkanSemaphore.cpp +++ b/src/nbl/video/CVulkanSemaphore.cpp @@ -7,8 +7,13 @@ namespace nbl::video CVulkanSemaphore::~CVulkanSemaphore() { - const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); - vulkanDevice->getFunctionTable()->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + const CVulkanLogicalDevice* vulkanDevice = static_cast(getOriginDevice()); + auto* vk = vulkanDevice->getFunctionTable(); + vk->vk.vkDestroySemaphore(vulkanDevice->getInternalObject(), m_semaphore, nullptr); + if (m_creationParams.externalHandleTypes != EHT_NONE) + { + CloseExternalHandle(m_externalHandle); + } } uint64_t CVulkanSemaphore::getCounterValue() const diff --git a/src/nbl/video/CVulkanSemaphore.h b/src/nbl/video/CVulkanSemaphore.h index 9290110d8d..12ba147a24 100644 --- a/src/nbl/video/CVulkanSemaphore.h +++ b/src/nbl/video/CVulkanSemaphore.h @@ -15,8 +15,8 @@ class ILogicalDevice; class CVulkanSemaphore final : public ISemaphore { public: - inline 
CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, const VkSemaphore semaphore) - : ISemaphore(std::move(_vkdev)), m_semaphore(semaphore) {} + inline CVulkanSemaphore(core::smart_refctd_ptr&& _vkdev, SCreationParams&& creationParams, const VkSemaphore semaphore, const external_handle_t externalHandle) + : ISemaphore(std::move(_vkdev), std::move(creationParams)), m_semaphore(semaphore), m_externalHandle(externalHandle) {} ~CVulkanSemaphore(); uint64_t getCounterValue() const override; @@ -24,11 +24,13 @@ class CVulkanSemaphore final : public ISemaphore inline const void* getNativeHandle() const override {return &m_semaphore;} VkSemaphore getInternalObject() const {return m_semaphore;} + external_handle_t getExternalHandle() const override { return m_externalHandle; } void setObjectDebugName(const char* label) const override; private: const VkSemaphore m_semaphore; + const external_handle_t m_externalHandle; }; } diff --git a/src/nbl/video/IDeviceMemoryAllocation.cpp b/src/nbl/video/IDeviceMemoryAllocation.cpp index 058f391de1..5f05e8d928 100644 --- a/src/nbl/video/IDeviceMemoryAllocation.cpp +++ b/src/nbl/video/IDeviceMemoryAllocation.cpp @@ -14,7 +14,7 @@ IDeviceMemoryAllocation::MemoryRange IDeviceMemoryAllocation::alignNonCoherentRa { const auto alignment = m_originDevice->getPhysicalDevice()->getLimits().nonCoherentAtomSize; range.offset = core::alignDown(range.offset,alignment); - range.length = core::min(core::alignUp(range.length,alignment),m_allocationSize); + range.length = core::min(core::alignUp(range.length,alignment),m_params.allocationSize); return range; } diff --git a/src/nbl/video/ILogicalDevice.cpp b/src/nbl/video/ILogicalDevice.cpp index bee6381f7a..6c414d2e82 100644 --- a/src/nbl/video/ILogicalDevice.cpp +++ b/src/nbl/video/ILogicalDevice.cpp @@ -298,6 +298,38 @@ bool ILogicalDevice::validateMemoryBarrier(const uint32_t queueFamilyIndex, asse return true; } +core::smart_refctd_ptr ILogicalDevice::createBuffer(IGPUBuffer::SCreationParams&& 
creationParams) +{ + const auto maxSize = getPhysicalDeviceLimits().maxBufferSize; + if (creationParams.size > maxSize) + { + m_logger.log("Failed to create Buffer, size %d larger than Device %p's limit!", system::ILogger::ELL_ERROR, creationParams.size, this, maxSize); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalBufferProperties(creationParams.usage, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + return createBuffer_impl(std::move(creationParams), dedicatedOnly); +} IQueue::RESULT ILogicalDevice::waitIdle() { @@ -324,6 +356,50 @@ core::smart_refctd_ptr ILogicalDevice::createBufferView(const as return createBufferView_impl(underlying, _fmt); } +core::smart_refctd_ptr ILogicalDevice::createImage(IGPUImage::SCreationParams&& creationParams) +{ + if (!IGPUImage::validateCreationParameters(creationParams)) + { + m_logger.log("Failed to create Image, invalid creation parameters!",system::ILogger::ELL_ERROR); + return nullptr; + } + if (creationParams.queueFamilyIndexCount>MaxQueueFamilies) + { + m_logger.log("Failed to create Image, queue family count %d for concurrent sharing larger than our max %d!",system::ILogger::ELL_ERROR,creationParams.queueFamilyIndexCount,MaxQueueFamilies); + return nullptr; + } + + bool dedicatedOnly = false; + if (creationParams.externalHandleTypes.value) + { + core::bitflag requestedTypes = 
creationParams.externalHandleTypes; + + while (const auto idx = hlsl::findLSB(static_cast(requestedTypes.value)) != -1) + { + const auto handleType = static_cast(1u << idx); + requestedTypes ^= handleType; + + auto props = m_physicalDevice->getExternalImageProperties(IPhysicalDevice::SImageFormatInfo{ + .format = creationParams.format, + .type = creationParams.type, + .tiling = creationParams.tiling, + .usage = creationParams.usage, + .flags = creationParams.flags + }, handleType); + + if (!core::bitflag(props.compatibleTypes).hasFlags(creationParams.externalHandleTypes)) + { + m_logger.log("Failed to create Buffer, Incompatible external handle type", system::ILogger::ELL_ERROR); + return nullptr; + } + + dedicatedOnly |= (props.features & IPhysicalDevice::EEMF_DEDICATED_ONLY_BIT); + } + } + + // TODO: validation of creationParams against the device's limits (sample counts, etc.) see vkCreateImage docs + return createImage_impl(std::move(creationParams), dedicatedOnly); +} core::smart_refctd_ptr ILogicalDevice::compileShader(const SShaderCreationParameters& creationParams) { diff --git a/src/nbl/video/utilities/CAssetConverter.cpp b/src/nbl/video/utilities/CAssetConverter.cpp index 5bb8be8274..d397cc4567 100644 --- a/src/nbl/video/utilities/CAssetConverter.cpp +++ b/src/nbl/video/utilities/CAssetConverter.cpp @@ -2459,12 +2459,11 @@ class MetaDeviceMemoryAllocator final failures.reserve(binItemCount); // ... using allocate_flags_t = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS; - IDeviceMemoryAllocator::SAllocateInfo info = { - .size = 0xdeadbeefBADC0FFEull, // set later - .flags = reqBin.first.needsDeviceAddress ? allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT:allocate_flags_t::EMAF_NONE, - .memoryTypeIndex = memTypeIx, - .dedication = nullptr - }; + IDeviceMemoryAllocator::SAllocateInfo info; + info.allocationSize = 0xdeadbeefBADC0FFEull; // set later + info.allocateFlags = reqBin.first.needsDeviceAddress ? 
allocate_flags_t::EMAF_DEVICE_ADDRESS_BIT : allocate_flags_t::EMAF_NONE; + info.memoryTypeIndex = memTypeIx; + info.dedication = nullptr; // allocate in progression of combined allocations, while trying allocate as much as possible in a single allocation auto binItemsIt = binItems.begin(); for (auto firstOffsetIt=offsetsTmp.begin(); firstOffsetIt!=offsetsTmp.end(); ) @@ -2473,7 +2472,7 @@ class MetaDeviceMemoryAllocator final const size_t combinedCount = std::distance(firstOffsetIt,nextOffsetIt); const size_t lastIx = combinedCount-1; // if we take `combinedCount` starting at `firstItem` their allocation would need this size - info.size = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; + info.allocationSize = (firstOffsetIt[lastIx]-*firstOffsetIt)+getAsBase(binItemsIt[lastIx])->getMemoryReqs().size; auto allocation = m_allocator->allocate(info); if (allocation.isValid()) {