diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt new file mode 100644 index 000000000..da95550e7 --- /dev/null +++ b/73_ImageUploadBenchmark/CMakeLists.txt @@ -0,0 +1,64 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() + +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") +set(DEPENDS + app_resources/common.hlsl + app_resources/tile_upload.comp.hlsl +) +target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + +set(SM 6_8) +set(JSON [=[ +[ + { + "INPUT": "app_resources/tile_upload.comp.hlsl", + "KEY": "snakeStore" + } +] +]=]) +string(CONFIGURE "${JSON}" JSON) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR 
${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) + +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) diff --git a/73_ImageUploadBenchmark/app_resources/common.hlsl b/73_ImageUploadBenchmark/app_resources/common.hlsl new file mode 100644 index 000000000..70155b2aa --- /dev/null +++ b/73_ImageUploadBenchmark/app_resources/common.hlsl @@ -0,0 +1,11 @@ +#include + +struct PushConstantData +{ + uint64_t deviceBufferAddress; + uint64_t dstTileLocationsAddress; + uint32_t2 dstOffset; + uint32_t srcWidth; + uint32_t srcHeight; + uint32_t tilesPerRow; +}; diff --git a/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl new file mode 100644 index 000000000..2237e1197 --- /dev/null +++ b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl @@ -0,0 +1,99 @@ +#include "common.hlsl" + +[[vk::binding(0,0)]] RWTexture2D dstImage; +[[vk::push_constant]] PushConstantData pc; + +using namespace nbl::hlsl; + +static const uint32_t TILE_SIZE = 128u; +static const uint32_t TILE_SIZE_LOG2 = 7u; +static const uint32_t TILE_SIZE_MASK = TILE_SIZE - 1u; +static const uint32_t TILE_PIXELS_LOG2 = TILE_SIZE_LOG2 * 2u; +static const uint32_t BLOCK_SIZE = 16u; +static const uint32_t BLOCK_SIZE_LOG2 = 4u; +static const uint32_t BLOCK_PIXELS_LOG2 = BLOCK_SIZE_LOG2 * 2u; +static const uint32_t BLOCKS_PER_TILE_LOG2 = TILE_SIZE_LOG2 - BLOCK_SIZE_LOG2; +static const uint32_t BLOCKS_PER_TILE = TILE_SIZE / BLOCK_SIZE; + +[numthreads(128, 4, 1)] +[shader("compute")] +void SnakeStore(uint32_t3 ID : SV_DispatchThreadID) +{ + const uint32_t2 globalPos = ID.xy; + 
const uint32_t2 tileCoord = globalPos >> TILE_SIZE_LOG2; + const uint32_t2 localPos = globalPos & TILE_SIZE_MASK; + const uint32_t tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t localLinearIdx = (localPos.y << TILE_SIZE_LOG2) + localPos.x; + const uint32_t srcPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + const uint32_t2 pixelPos = pc.dstOffset + (dstTile << TILE_SIZE_LOG2) + localPos; + + const uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + srcPixelIdx * 4u); + + dstImage[pixelPos] = unpackUnorm4x8(int32_t(packed)); +} + +[numthreads(128, 4, 1)] +[shader("compute")] +void SnakeLoad(uint32_t3 ID : SV_DispatchThreadID) +{ + const uint32_t2 globalPos = ID.xy; + const uint32_t2 tileCoord = globalPos >> TILE_SIZE_LOG2; + const uint32_t2 localPos = globalPos & TILE_SIZE_MASK; + const uint32_t tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t localLinearIdx = (localPos.y << TILE_SIZE_LOG2) + localPos.x; + const uint32_t dstPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + const uint32_t2 pixelPos = pc.dstOffset + (dstTile << TILE_SIZE_LOG2) + localPos; + + vk::RawBufferStore(pc.deviceBufferAddress + dstPixelIdx * 4u, uint32_t(packUnorm4x8(dstImage[pixelPos]))); +} + +[numthreads(16, 16, 1)] +[shader("compute")] +void MortonStore(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) +{ + const uint32_t2 globalBlock = GroupID.xy; + const uint32_t2 threadPos = ID.xy; + const uint32_t2 tileCoord = globalBlock >> BLOCKS_PER_TILE_LOG2; + const uint32_t2 blockCoordInTile = globalBlock & (BLOCKS_PER_TILE - 1u); + const uint32_t 
tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t blockIdxInTile = (blockCoordInTile.y << BLOCKS_PER_TILE_LOG2) + blockCoordInTile.x; + const uint32_t localLinearIdx = (threadPos.y << BLOCK_SIZE_LOG2) + threadPos.x; + const uint32_t srcPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + (blockIdxInTile << BLOCK_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + + morton::code mc; + mc.value = uint16_t(localLinearIdx); + const uint32_t2 mortonLocalPos = _static_cast(mc); + const uint32_t2 pixelPos = pc.dstOffset + (dstTile << TILE_SIZE_LOG2) + (blockCoordInTile << BLOCK_SIZE_LOG2) + mortonLocalPos; + + const uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + srcPixelIdx * 4u); + dstImage[pixelPos] = unpackUnorm4x8(int32_t(packed)); +} + +[numthreads(16, 16, 1)] +[shader("compute")] +void MortonLoad(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) +{ + const uint32_t2 globalBlock = GroupID.xy; + const uint32_t2 threadPos = ID.xy; + const uint32_t2 tileCoord = globalBlock >> BLOCKS_PER_TILE_LOG2; + const uint32_t2 blockCoordInTile = globalBlock & (BLOCKS_PER_TILE - 1u); + const uint32_t tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t blockIdxInTile = (blockCoordInTile.y << BLOCKS_PER_TILE_LOG2) + blockCoordInTile.x; + const uint32_t localLinearIdx = (threadPos.y << BLOCK_SIZE_LOG2) + threadPos.x; + const uint32_t dstPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + (blockIdxInTile << BLOCK_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + + morton::code mc; + mc.value = uint16_t(localLinearIdx); + const uint32_t2 mortonLocalPos = _static_cast(mc); + const uint32_t2 pixelPos = pc.dstOffset + 
(dstTile << TILE_SIZE_LOG2) + (blockCoordInTile << BLOCK_SIZE_LOG2) + mortonLocalPos; + + vk::RawBufferStore(pc.deviceBufferAddress + dstPixelIdx * 4u, uint32_t(packUnorm4x8(dstImage[pixelPos]))); +} diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/73_ImageUploadBenchmark/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp new file mode 100644 index 000000000..4910cfbbe --- /dev/null +++ b/73_ImageUploadBenchmark/main.cpp @@ -0,0 +1,1368 @@ +#include "nbl/examples/examples.hpp" +#include "nbl/this_example/builtin/build/spirv/keys.hpp" +#include +#include + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::examples; + +class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + +public: + ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD,
_sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); + constexpr uint32_t TOTAL_FRAMES = 1000; + + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT); + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits & ~hostCachedBits; + + m_logger->log("Memory type bits HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", + ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits); + m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", + ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + + if (!hostVisibleOnlyBits) + { + m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR); + return false; + } + + if (!deviceLocalBits) + { + m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR); + 
return false; + } + + m_queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); + { + IGPUImage::SCreationParams imgParams{}; + imgParams.type = IImage::E_TYPE::ET_2D; + uint32_t tilePerRow = (uint32_t)std::sqrt(TILES_PER_FRAME); + imgParams.extent.width = TILE_SIZE * tilePerRow; + imgParams.extent.height = TILE_SIZE * tilePerRow; + imgParams.extent.depth = 1u; + imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; + imgParams.mipLevels = 1u; + imgParams.flags = IImage::ECF_NONE; + imgParams.arrayLayers = 1u; + imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; + imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_STORAGE_BIT; + imgParams.preinitialized = false; + + m_destinationImage = m_device->createImage(std::move(imgParams)); + if (!m_destinationImage) + return logFail("Failed to create destination image!\n"); + + m_destinationImage->setObjectDebugName("Destination Image"); + + auto reqs = m_destinationImage->getMemoryReqs(); + reqs.memoryTypeBits &= deviceLocalBits; + + auto allocation = m_device->allocate(reqs, m_destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!allocation.isValid()) + return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); + } + + //compute shader + auto loadPrecompiledShader = [&]()->smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + + auto key = nbl::this_example::builtin::build::get_spirv_key(m_physicalDevice->getLimits(), m_physicalDevice->getFeatures()); + m_logger->log("Loading shader with key: %s", ILogger::ELL_INFO, key.data()); + + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Asset bundle is empty for key: %s", ILogger::ELL_ERROR, key.data()); + return smart_refctd_ptr(nullptr); + } + + m_logger->log("Asset count: 
%u, asset type: %u", ILogger::ELL_INFO, assets.size(), (uint32_t)assets[0]->getAssetType()); + + auto shader = IAsset::castDown(assets[0]); + return shader; + }; + + + //Setup compute shader resources + m_logger->log("\n=== Setting up Compute Shaders (Snake + Morton) ===", ILogger::ELL_PERFORMANCE); + { + auto shaderLib = loadPrecompiledShader.operator() < "snakeStore" > (); + if (!shaderLib) + return logFail("Failed to load shader library!\n"); + + IGPUDescriptorSetLayout::SBinding dsBinding = { + .binding = 0, + .type = IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1 + }; + auto dsLayout = m_device->createDescriptorSetLayout({ &dsBinding, 1 }); + if (!dsLayout) + return logFail("Failed to create descriptor set layout!\n"); + + asset::SPushConstantRange pcRange = { + .stageFlags = hlsl::ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(SPushConstantData) + }; + + m_pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(dsLayout)); + if (!m_pipelineLayout) + return logFail("Failed to create pipeline layout!\n"); + + auto createPipeline = [&](const char* entryPoint, smart_refctd_ptr& outPipeline) -> bool + { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipelineLayout.get(); + params.shader.shader = shaderLib.get(); + params.shader.entryPoint = entryPoint; + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &outPipeline)) + return logFail("Failed to create %s pipeline!\n", entryPoint); + return true; + }; + + if (!createPipeline("SnakeStore", m_snakeStorePipeline)) return false; + if (!createPipeline("SnakeLoad", m_snakeLoadPipeline)) return false; + if (!createPipeline("MortonStore", m_mortonStorePipeline)) return false; + if (!createPipeline("MortonLoad", m_mortonLoadPipeline)) return false; + + auto imageView = m_device->createImageView({ + .flags = 
IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::EUF_STORAGE_BIT, + .image = smart_refctd_ptr(m_destinationImage), + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::E_FORMAT::EF_R8G8B8A8_UNORM + }); + if (!imageView) + return logFail("Failed to create image view!\n"); + + uint32_t setCount = 1; + auto dsPool = m_device->createDescriptorPoolForDSLayouts( + IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }, &setCount); + m_ds = dsPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + + IGPUDescriptorSet::SDescriptorInfo imgInfo = {}; + imgInfo.desc = imageView; + imgInfo.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL; + + IGPUDescriptorSet::SWriteDescriptorSet dsWrite = { + .dstSet = m_ds.get(), + .binding = 0, + .arrayElement = 0, + .count = 1, + .info = &imgInfo + }; + m_device->updateDescriptorSets({ &dsWrite, 1 }, {}); + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Verify Staging Buffer", m_stagingBuffer, m_stagingAlloc, m_stagingMappedPtr)) + return false; + + if (!createStagingBuffer(sizeof(uint32_t), hostVisibleOnlyBits, + "Verify Dst Tile Locations", m_verifyDstTileLocationsBuffer, m_verifyDstTileLocationsAlloc, m_verifyDstTileLocationsMappedPtr)) + return false; + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Snake Readback Buffer", m_snakeReadbackBuffer, m_snakeReadbackAlloc, m_snakeReadbackMappedPtr)) + return false; + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Morton Readback Buffer", m_mortonReadbackBuffer, m_mortonReadbackAlloc, m_mortonReadbackMappedPtr)) + return false; + + { + uint32_t* pixels = static_cast(m_stagingMappedPtr); + uint32_t totalPixels = TILE_SIZE * TILE_SIZE; + for (uint32_t i = 0; i < totalPixels; i++) + { + uint8_t val = static_cast(i & 0xFF); + pixels[i] = val | (val << 8u) | (val << 16u) | (val << 24u); + } + + if (!m_stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + 
ILogicalDevice::MappedMemoryRange range(m_stagingAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + } + { + uint32_t* dstTileLocation = static_cast(m_verifyDstTileLocationsMappedPtr); + *dstTileLocation = packDstTileLocation(0u, 0u); + + if (!m_verifyDstTileLocationsAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_verifyDstTileLocationsAlloc.memory.get(), 0, sizeof(uint32_t)); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + m_cmdPool = m_device->createCommandPool( + m_queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &m_cmdbuf); + m_sem = m_device->createSemaphore(0); + + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = m_destinationImage.get(); + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + m_cmdbuf->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + 
IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = m_sem.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + m_queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = 1 }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + } + + m_logger->log("Setup complete. Running benchmarks.", ILogger::ELL_PERFORMANCE); + + runAllBenchmarks(); + + if (!verifyComputeShaders()) + return false; + + return true; + } + + bool keepRunning() override { return false; } + + void workLoopBody() override {} + + bool onAppTerminated() override + { + + m_logger->log("\nResults above. Waiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE); + std::this_thread::sleep_for(std::chrono::seconds(5)); + + if (m_stagingAlloc.memory) + m_stagingAlloc.memory->unmap(); + if (m_verifyDstTileLocationsAlloc.memory) + m_verifyDstTileLocationsAlloc.memory->unmap(); + if (m_snakeReadbackAlloc.memory) + m_snakeReadbackAlloc.memory->unmap(); + if (m_mortonReadbackAlloc.memory) + m_mortonReadbackAlloc.memory->unmap(); + return true; + } + +protected: + core::vector getQueueRequirements() const override + { + using flags_t = IQueue::FAMILY_FLAGS; + return { { + .requiredFlags = flags_t::GRAPHICS_BIT, + .disallowedFlags = flags_t::NONE, + .queueCount = 1, + .maxImageTransferGranularity = {1, 1, 1} + } }; + } + +private: + static constexpr uint32_t TILE_SIZE = 128; + static constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; + static constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; + + struct SPushConstantData + { + uint64_t deviceBufferAddress; + uint64_t dstTileLocationsAddress; + uint32_t dstOffsetX; + uint32_t dstOffsetY; + uint32_t srcWidth; + uint32_t srcHeight; + uint32_t tilesPerRow; + }; + + IQueue* m_queue = nullptr; + smart_refctd_ptr m_destinationImage; + smart_refctd_ptr m_snakeStorePipeline; + smart_refctd_ptr 
m_snakeLoadPipeline; + smart_refctd_ptr m_mortonStorePipeline; + smart_refctd_ptr m_mortonLoadPipeline; + smart_refctd_ptr m_pipelineLayout; + smart_refctd_ptr m_ds; + smart_refctd_ptr m_stagingBuffer; + smart_refctd_ptr m_verifyDstTileLocationsBuffer; + smart_refctd_ptr m_snakeReadbackBuffer; + smart_refctd_ptr m_mortonReadbackBuffer; + IDeviceMemoryAllocator::SAllocation m_stagingAlloc; + IDeviceMemoryAllocator::SAllocation m_verifyDstTileLocationsAlloc; + IDeviceMemoryAllocator::SAllocation m_snakeReadbackAlloc; + IDeviceMemoryAllocator::SAllocation m_mortonReadbackAlloc; + void* m_stagingMappedPtr = nullptr; + void* m_verifyDstTileLocationsMappedPtr = nullptr; + void* m_snakeReadbackMappedPtr = nullptr; + void* m_mortonReadbackMappedPtr = nullptr; + smart_refctd_ptr m_cmdPool; + smart_refctd_ptr m_cmdbuf; + smart_refctd_ptr m_sem; + + void runAllBenchmarks() + { + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); + constexpr uint32_t TOTAL_FRAMES = 1000; + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits & ~hostCachedBits; + + m_logger->log("\n=== RUNNING BENCHMARKS ===", ILogger::ELL_PERFORMANCE); + + struct BenchmarkResult + { + const char* name; + double wallGBps; + double gpuGBps; + double memcpyGBps; + }; + std::vector results; + + //SysRAM benchmarks + { + smart_refctd_ptr benchStagingBuffer; + IDeviceMemoryAllocator::SAllocation benchStagingAlloc; + void* benchMappedPtr = nullptr; + uint32_t benchBufSize = 
STAGING_BUFFER_SIZE; + + if (createStagingBuffer(benchBufSize, hostVisibleOnlyBits, + "Benchmark Staging (SysRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr)) + { + m_logger->log("\n--- CopyBufferToImage (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rCopy = runBenchmarkBufferToImageCopyCommand("CopyBufferToImage (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "CopyBufferToImage (SysRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps }); + + m_logger->log("\n--- Snake Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rSnake = runBenchmarkCompute("Snake Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_snakeStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 128u, 4u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Snake Compute (SysRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps }); + + m_logger->log("\n--- Morton Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rMorton = runBenchmarkCompute("Morton Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_mortonStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 16u, 16u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Morton Compute (SysRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps }); + + benchStagingAlloc.memory->unmap(); + } + } + + //BAR/VRAM benchmarks (if available) + if (hostVisibleDeviceLocalBits) + { + smart_refctd_ptr benchStagingBuffer; + IDeviceMemoryAllocator::SAllocation benchStagingAlloc; + void* benchMappedPtr = nullptr; + uint32_t benchBufSize = STAGING_BUFFER_SIZE; + + if (createStagingBuffer(benchBufSize, 
hostVisibleDeviceLocalBits, + "Benchmark Staging (BAR/VRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr)) + { + m_logger->log("\n--- CopyBufferToImage (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rCopy = runBenchmarkBufferToImageCopyCommand("CopyBufferToImage (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "CopyBufferToImage (BAR/VRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps }); + + m_logger->log("\n--- Snake Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rSnake = runBenchmarkCompute("Snake Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_snakeStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 128u, 4u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Snake Compute (BAR/VRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps }); + + m_logger->log("\n--- Morton Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rMorton = runBenchmarkCompute("Morton Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_mortonStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 16u, 16u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Morton Compute (BAR/VRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps }); + + benchStagingAlloc.memory->unmap(); + } + } + + //Summary table + m_logger->log("\n=== BENCHMARK RESULTS ===", ILogger::ELL_PERFORMANCE); + m_logger->log("%-36s | Wall GB/s | GPU GB/s | Memcpy GB/s", ILogger::ELL_PERFORMANCE, "Strategy"); + m_logger->log("-------------------------------------+-----------+----------+------------", ILogger::ELL_PERFORMANCE); + for (const auto& r : results) + { 
+ m_logger->log("%-36s | %9.2f | %8.2f | %10.2f", ILogger::ELL_PERFORMANCE, r.name, r.wallGBps, r.gpuGBps, r.memcpyGBps); + } + m_logger->log("=====================================+===========+==========+============", ILogger::ELL_PERFORMANCE); + } + + struct BenchResult + { + double wallGBps; + double gpuGBps; + double memcpyGBps; + }; + + static uint32_t packDstTileLocation(uint32_t tileX, uint32_t tileY) + { + return (tileX & 0xffffu) | ((tileY & 0xffffu) << 16u); + } + + void generateTileCopyRegions( + IImage::SBufferCopy* outRegions, + uint32_t tilesPerFrame, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t imageWidth, + uint32_t bufferBaseOffset) + { + uint32_t tilesPerRow = imageWidth / tileSize; + for (size_t i = 0; i < tilesPerFrame; i++) + { + uint32_t tileX = (i % tilesPerRow) * tileSize; + uint32_t tileY = (i / tilesPerRow) * tileSize; + + outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes); + outRegions[i].bufferRowLength = tileSize; + outRegions[i].bufferImageHeight = tileSize; + outRegions[i].imageOffset = { tileX, tileY, 0 }; + outRegions[i].imageExtent = { tileSize, tileSize, 1 }; + outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + outRegions[i].imageSubresource.mipLevel = 0; + outRegions[i].imageSubresource.baseArrayLayer = 0; + outRegions[i].imageSubresource.layerCount = 1; + } + } + + BenchResult runBenchmarkBufferToImageCopyCommand( + const char* strategyName, + IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = 
framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = 
PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; + + std::vector cpuSourceData(partitionSize); + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } + std::vector> regionsPerFrame(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + regionsPerFrame[i].resize(tilesPerFrame); + uint32_t bufferOffset = i * partitionSize; + generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({ &frameWaitInfo, 1 }); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData.data(), partitionSize); + + if 
(!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::GENERAL; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = destinationImage; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); + + commandBuffers[cmdBufIndex]->copyBufferToImage( + stagingBuffer, + destinationImage, + IImage::LAYOUT::GENERAL, + tilesPerFrame, + regionsPerFrame[cmdBufIndex].data() + ); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + 
afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = commandBuffers[cmdBufIndex].get() }; + frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 }; + + queue->submit({ &frameSubmitInfo, 1 }); + auto t5 = std::chrono::high_resolution_clock::now(); + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); + } + + // End marker is after last submit, NOT after GPU finishes. 
+ auto endTime = std::chrono::high_resolution_clock::now(); + + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &finalWait, 1 }); + + // Read timestamps from the last completed flight of command buffers + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0); + + double wallThroughputGBps = totalGB / elapsedSeconds; + double gpuThroughputGBps = totalGB / totalGpuTimeSeconds; + + m_logger->log(" GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps); + m_logger->log(" GPU only throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps); + + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s 
(%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + double memcpyGBps = totalGB / totalMemcpyTime; + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps); + + return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps }; + } + + + double runBenchmarkImageStaging( + const char* strategyName, + const std::vector>& stagingImages, + const std::vector& imageMemoryOffsets, + IDeviceMemoryAllocation* stagingMemory, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + // Disabled after testing: this path needs CPU writes into host-visible + // OPTIMAL images, but the memory layout and preinitialized-image lifetime + // rules are too implementation-dependent to make this a clean benchmark. 
+ return 0.0; + } + + BenchResult runBenchmarkCompute( + const char* strategyName, + IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, + void* mappedPtr, + IGPUImage* destinationImage, + IGPUComputePipeline* pipeline, + IGPUPipelineLayout* pipelineLayout, + IGPUDescriptorSet* ds, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t workgroupSizeX, + uint32_t workgroupSizeY, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + 
initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t tilesPerRow = imageWidth / tileSize; + uint32_t tileRows = (tilesPerFrame + tilesPerRow - 1u) / tilesPerRow; + uint32_t dispatchX = (tilesPerRow * tileSize + workgroupSizeX - 1u) / workgroupSizeX; + uint32_t dispatchY = (tileRows * tileSize + workgroupSizeY - 1u) / workgroupSizeY; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; + + std::vector cpuSourceData(partitionSize); + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } + + const uint32_t dstTileLocationsBufferSize = framesInFlight * tilesPerFrame * sizeof(uint32_t); + std::vector dstTileLocations(framesInFlight * tilesPerFrame); + for (uint32_t flight = 0; flight < framesInFlight; 
flight++) + { + for (uint32_t i = 0; i < tilesPerFrame; i++) + { + uint32_t dstTileX = i % tilesPerRow; + uint32_t dstTileY = i / tilesPerRow; + dstTileLocations[flight * tilesPerFrame + i] = packDstTileLocation(dstTileX, dstTileY); + } + } + + smart_refctd_ptr dstTileLocationsBuffer; + IDeviceMemoryAllocator::SAllocation dstTileLocationsAlloc; + void* dstTileLocationsMappedPtr = nullptr; + const uint32_t dstTileLocationsMemoryTypeBits = + m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(stagingAlloc.memory->getMemoryPropertyFlags()); + if (!createStagingBuffer(dstTileLocationsBufferSize, dstTileLocationsMemoryTypeBits, + "Benchmark Dst Tile Locations", dstTileLocationsBuffer, dstTileLocationsAlloc, dstTileLocationsMappedPtr)) + return { 0.0, 0.0, 0.0 }; + + memcpy(dstTileLocationsMappedPtr, dstTileLocations.data(), dstTileLocationsBufferSize); + if (!dstTileLocationsAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(dstTileLocationsAlloc.memory.get(), 0, dstTileLocationsBufferSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({ &frameWaitInfo, 1 }); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData.data(), 
partitionSize); + + if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + }; + + IGPUCommandBuffer::SImageMemoryBarrier dstBarrier = {}; + dstBarrier.oldLayout = IImage::LAYOUT::GENERAL; + dstBarrier.newLayout = IImage::LAYOUT::GENERAL; + dstBarrier.image = destinationImage; + dstBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + dstBarrier.subresourceRange.baseMipLevel = 0; + dstBarrier.subresourceRange.levelCount = 1; + dstBarrier.subresourceRange.baseArrayLayer = 0; + dstBarrier.subresourceRange.layerCount = 1; + dstBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + dstBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + dstBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + dstBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { + .memBarriers = {&memBarrier, 1}, + .imgBarriers = {&dstBarrier, 1} + }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 0); + + commandBuffers[cmdBufIndex]->bindComputePipeline(pipeline); + const IGPUDescriptorSet* sets[] = { ds }; + 
commandBuffers[cmdBufIndex]->bindDescriptorSets(asset::EPBP_COMPUTE, pipelineLayout, 0, 1, sets); + + uint64_t dstTileLocationsOffset = uint64_t(cmdBufIndex) * tilesPerFrame * sizeof(uint32_t); + SPushConstantData pc = { + .deviceBufferAddress = stagingBuffer->getDeviceAddress() + bufferOffset, + .dstTileLocationsAddress = dstTileLocationsBuffer->getDeviceAddress() + dstTileLocationsOffset, + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = tileSize, + .srcHeight = tileSize, + .tilesPerRow = tilesPerRow + }; + commandBuffers[cmdBufIndex]->pushConstants(pipelineLayout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &pc); + commandBuffers[cmdBufIndex]->dispatch(dispatchX, dispatchY, 1u); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = 
commandBuffers[cmdBufIndex].get() }; + frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 }; + + queue->submit({ &frameSubmitInfo, 1 }); + auto t5 = std::chrono::high_resolution_clock::now(); + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); + } + + // End marker is after last submit, NOT after GPU finishes. + auto endTime = std::chrono::high_resolution_clock::now(); + + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &finalWait, 1 }); + + // Read timestamps from the last completed flight of command buffers + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * 
tilesPerFrame * tileSizeBytes; + double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0); + + double wallThroughputGBps = totalGB / elapsedSeconds; + double gpuThroughputGBps = totalGB / totalGpuTimeSeconds; + + m_logger->log(" GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps); + m_logger->log(" GPU only throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps); + + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + double memcpyGBps = totalGB / totalMemcpyTime; + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps); + + dstTileLocationsAlloc.memory->unmap(); + + return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps }; + } + + bool createStagingBuffer( + uint32_t bufferSize, + uint32_t memoryTypeBits, + const char* debugName, + smart_refctd_ptr& outBuffer, + IDeviceMemoryAllocator::SAllocation& outAllocation, + void*& outMappedPtr) + { + IGPUBuffer::SCreationParams params; + params.size = bufferSize; + params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + outBuffer = m_device->createBuffer(std::move(params)); + if (!outBuffer) + return logFail("Failed to create GPU buffer of size %d!\n", bufferSize); + + outBuffer->setObjectDebugName(debugName); + + auto reqs = outBuffer->getMemoryReqs(); + 
reqs.memoryTypeBits &= memoryTypeBits; + + outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + if (!outAllocation.isValid()) + return logFail("Failed to allocate Device Memory!\n"); + + outMappedPtr = outAllocation.memory->map({ 0ull, outAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE); + if (!outMappedPtr) + return logFail("Failed to map Device Memory!\n"); + + return true; + } + + bool verifyComputeShaders() + { + const uint64_t semValue = 2; + + m_cmdPool->reset(); + + memset(m_snakeReadbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + memset(m_mortonReadbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + const IGPUDescriptorSet* sets[] = { m_ds.get() }; + SPushConstantData storePc = { + .deviceBufferAddress = m_stagingBuffer->getDeviceAddress(), + .dstTileLocationsAddress = m_verifyDstTileLocationsBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE, + .tilesPerRow = 1u + }; + + //SNAKE VERIFICATION + { + IGPUCommandBuffer::SImageMemoryBarrier snakePreBarrier = {}; + snakePreBarrier.oldLayout = IImage::LAYOUT::GENERAL; + snakePreBarrier.newLayout = IImage::LAYOUT::GENERAL; + snakePreBarrier.image = m_destinationImage.get(); + snakePreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + 
snakePreBarrier.subresourceRange.baseMipLevel = 0; + snakePreBarrier.subresourceRange.levelCount = 1; + snakePreBarrier.subresourceRange.baseArrayLayer = 0; + snakePreBarrier.subresourceRange.layerCount = 1; + snakePreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + snakePreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + snakePreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + snakePreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakePreBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_snakeStorePipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(1u, TILE_SIZE / 4u, 1u); + + { + IGPUCommandBuffer::SImageMemoryBarrier snakeMidBarrier = {}; + snakeMidBarrier.oldLayout = IImage::LAYOUT::GENERAL; + snakeMidBarrier.newLayout = IImage::LAYOUT::GENERAL; + snakeMidBarrier.image = m_destinationImage.get(); + snakeMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + snakeMidBarrier.subresourceRange.baseMipLevel = 0; + snakeMidBarrier.subresourceRange.levelCount = 1; + snakeMidBarrier.subresourceRange.baseArrayLayer = 0; + snakeMidBarrier.subresourceRange.layerCount = 1; + snakeMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + snakeMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + snakeMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + snakeMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakeMidBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_snakeLoadPipeline.get()); + 
m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData snakeLoadPc = { + .deviceBufferAddress = m_snakeReadbackBuffer->getDeviceAddress(), + .dstTileLocationsAddress = m_verifyDstTileLocationsBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE, + .tilesPerRow = 1u + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &snakeLoadPc); + m_cmdbuf->dispatch(1u, TILE_SIZE / 4u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + //MORTON VERIFICATION + { + IGPUCommandBuffer::SImageMemoryBarrier mortonPreBarrier = {}; + mortonPreBarrier.oldLayout = IImage::LAYOUT::GENERAL; + mortonPreBarrier.newLayout = IImage::LAYOUT::GENERAL; + mortonPreBarrier.image = m_destinationImage.get(); + mortonPreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + mortonPreBarrier.subresourceRange.baseMipLevel = 0; + mortonPreBarrier.subresourceRange.levelCount = 1; + mortonPreBarrier.subresourceRange.baseArrayLayer = 0; + mortonPreBarrier.subresourceRange.layerCount = 1; + mortonPreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + mortonPreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + mortonPreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + mortonPreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonPreBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_mortonStorePipeline.get()); + 
m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(TILE_SIZE / 16u, TILE_SIZE / 16u, 1u); + + { + IGPUCommandBuffer::SImageMemoryBarrier mortonMidBarrier = {}; + mortonMidBarrier.oldLayout = IImage::LAYOUT::GENERAL; + mortonMidBarrier.newLayout = IImage::LAYOUT::GENERAL; + mortonMidBarrier.image = m_destinationImage.get(); + mortonMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + mortonMidBarrier.subresourceRange.baseMipLevel = 0; + mortonMidBarrier.subresourceRange.levelCount = 1; + mortonMidBarrier.subresourceRange.baseArrayLayer = 0; + mortonMidBarrier.subresourceRange.layerCount = 1; + mortonMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + mortonMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + mortonMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + mortonMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonMidBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_mortonLoadPipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData mortonLoadPc = { + .deviceBufferAddress = m_mortonReadbackBuffer->getDeviceAddress(), + .dstTileLocationsAddress = m_verifyDstTileLocationsBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE, + .tilesPerRow = 1u + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &mortonLoadPc); + m_cmdbuf->dispatch(TILE_SIZE / 16u, TILE_SIZE / 16u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + 
.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + m_cmdbuf->end(); + + // Submit and wait + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = m_sem.get(), + .value = semValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + m_api->startCapture(); + m_queue->submit({ &submitInfo, 1 }); + m_api->endCapture(); + + ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = semValue }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + + const uint32_t* srcPixels = static_cast(m_stagingMappedPtr); + uint32_t totalPixels = TILE_SIZE * TILE_SIZE; + + const uint32_t* snakeDstPixels = static_cast(m_snakeReadbackMappedPtr); + uint32_t snakeMatchCount = 0; + uint32_t snakeFirstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == snakeDstPixels[i]) + snakeMatchCount++; + else if (snakeFirstMismatchIdx == ~0u) + snakeFirstMismatchIdx = i; + } + + if (snakeMatchCount == totalPixels) + { + m_logger->log("Snake 
verification PASS. All %u pixels match.", ILogger::ELL_PERFORMANCE, totalPixels); + } + else + { + m_logger->log("Snake verification FAIL. %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, snakeMatchCount, totalPixels, snakeFirstMismatchIdx, srcPixels[snakeFirstMismatchIdx], snakeDstPixels[snakeFirstMismatchIdx]); + return false; + } + + const uint32_t* mortonDstPixels = static_cast(m_mortonReadbackMappedPtr); + uint32_t mortonMatchCount = 0; + uint32_t mortonFirstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == mortonDstPixels[i]) + mortonMatchCount++; + else if (mortonFirstMismatchIdx == ~0u) + mortonFirstMismatchIdx = i; + } + + if (mortonMatchCount == totalPixels) + { + m_logger->log("Morton verification PASS. All %u pixels match.", ILogger::ELL_PERFORMANCE, totalPixels); + } + else + { + m_logger->log("Morton verification FAIL. %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, mortonMatchCount, totalPixels, mortonFirstMismatchIdx, srcPixels[mortonFirstMismatchIdx], mortonDstPixels[mortonFirstMismatchIdx]); + return false; + } + + return true; + } +}; + +NBL_MAIN_FUNC(ImageUploadBenchmarkApp) diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy new file mode 100644 index 000000000..1249f10b5 --- /dev/null +++ b/73_ImageUploadBenchmark/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CImageUploadBenchmark extends IBuilder +{ + public CImageUploadBenchmark(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = 
axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CImageUploadBenchmark(_agent, _info) /* factory must return this file's builder class; was CStreamingAndBufferDeviceAddressBuilder, copy-pasted from the streaming example, which is not defined here and would fail at pipeline load */ +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe482aa4..2d4ed7408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(73_ImageUploadBenchmark) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS)