diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt new file mode 100644 index 000000000..da95550e7 --- /dev/null +++ b/73_ImageUploadBenchmark/CMakeLists.txt @@ -0,0 +1,64 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() + +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") +set(DEPENDS + app_resources/common.hlsl + app_resources/tile_upload.comp.hlsl +) +target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + +set(SM 6_8) +set(JSON [=[ +[ + { + "INPUT": "app_resources/tile_upload.comp.hlsl", + "KEY": "snakeStore" + } +] +]=]) +string(CONFIGURE "${JSON}" JSON) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR 
${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) + +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) diff --git a/73_ImageUploadBenchmark/app_resources/common.hlsl b/73_ImageUploadBenchmark/app_resources/common.hlsl new file mode 100644 index 000000000..70155b2aa --- /dev/null +++ b/73_ImageUploadBenchmark/app_resources/common.hlsl @@ -0,0 +1,11 @@ +#include + +struct PushConstantData +{ + uint64_t deviceBufferAddress; + uint64_t dstTileLocationsAddress; + uint32_t2 dstOffset; + uint32_t srcWidth; + uint32_t srcHeight; + uint32_t tilesPerRow; +}; diff --git a/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl new file mode 100644 index 000000000..2237e1197 --- /dev/null +++ b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl @@ -0,0 +1,99 @@ +#include "common.hlsl" + +[[vk::binding(0,0)]] RWTexture2D dstImage; +[[vk::push_constant]] PushConstantData pc; + +using namespace nbl::hlsl; + +static const uint32_t TILE_SIZE = 128u; +static const uint32_t TILE_SIZE_LOG2 = 7u; +static const uint32_t TILE_SIZE_MASK = TILE_SIZE - 1u; +static const uint32_t TILE_PIXELS_LOG2 = TILE_SIZE_LOG2 * 2u; +static const uint32_t BLOCK_SIZE = 16u; +static const uint32_t BLOCK_SIZE_LOG2 = 4u; +static const uint32_t BLOCK_PIXELS_LOG2 = BLOCK_SIZE_LOG2 * 2u; +static const uint32_t BLOCKS_PER_TILE_LOG2 = TILE_SIZE_LOG2 - BLOCK_SIZE_LOG2; +static const uint32_t BLOCKS_PER_TILE = TILE_SIZE / BLOCK_SIZE; + +[numthreads(128, 4, 1)] +[shader("compute")] +void SnakeStore(uint32_t3 ID : SV_DispatchThreadID) +{ + const uint32_t2 globalPos = ID.xy; + 
const uint32_t2 tileCoord = globalPos >> TILE_SIZE_LOG2; + const uint32_t2 localPos = globalPos & TILE_SIZE_MASK; + const uint32_t tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t localLinearIdx = (localPos.y << TILE_SIZE_LOG2) + localPos.x; + const uint32_t srcPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + const uint32_t2 pixelPos = pc.dstOffset + (dstTile << TILE_SIZE_LOG2) + localPos; + + const uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + srcPixelIdx * 4u); + + dstImage[pixelPos] = unpackUnorm4x8(int32_t(packed)); +} + +[numthreads(128, 4, 1)] +[shader("compute")] +void SnakeLoad(uint32_t3 ID : SV_DispatchThreadID) +{ + const uint32_t2 globalPos = ID.xy; + const uint32_t2 tileCoord = globalPos >> TILE_SIZE_LOG2; + const uint32_t2 localPos = globalPos & TILE_SIZE_MASK; + const uint32_t tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t localLinearIdx = (localPos.y << TILE_SIZE_LOG2) + localPos.x; + const uint32_t dstPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + const uint32_t2 pixelPos = pc.dstOffset + (dstTile << TILE_SIZE_LOG2) + localPos; + + vk::RawBufferStore(pc.deviceBufferAddress + dstPixelIdx * 4u, uint32_t(packUnorm4x8(dstImage[pixelPos]))); +} + +[numthreads(16, 16, 1)] +[shader("compute")] +void MortonStore(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) +{ + const uint32_t2 globalBlock = GroupID.xy; + const uint32_t2 threadPos = ID.xy; + const uint32_t2 tileCoord = globalBlock >> BLOCKS_PER_TILE_LOG2; + const uint32_t2 blockCoordInTile = globalBlock & (BLOCKS_PER_TILE - 1u); + const uint32_t 
tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t blockIdxInTile = (blockCoordInTile.y << BLOCKS_PER_TILE_LOG2) + blockCoordInTile.x; + const uint32_t localLinearIdx = (threadPos.y << BLOCK_SIZE_LOG2) + threadPos.x; + const uint32_t srcPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + (blockIdxInTile << BLOCK_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + + morton::code mc; + mc.value = uint16_t(localLinearIdx); + const uint32_t2 mortonLocalPos = _static_cast(mc); + const uint32_t2 pixelPos = pc.dstOffset + (dstTile << TILE_SIZE_LOG2) + (blockCoordInTile << BLOCK_SIZE_LOG2) + mortonLocalPos; + + const uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + srcPixelIdx * 4u); + dstImage[pixelPos] = unpackUnorm4x8(int32_t(packed)); +} + +[numthreads(16, 16, 1)] +[shader("compute")] +void MortonLoad(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID) +{ + const uint32_t2 globalBlock = GroupID.xy; + const uint32_t2 threadPos = ID.xy; + const uint32_t2 tileCoord = globalBlock >> BLOCKS_PER_TILE_LOG2; + const uint32_t2 blockCoordInTile = globalBlock & (BLOCKS_PER_TILE - 1u); + const uint32_t tileIdx = tileCoord.y * pc.tilesPerRow + tileCoord.x; + const uint32_t blockIdxInTile = (blockCoordInTile.y << BLOCKS_PER_TILE_LOG2) + blockCoordInTile.x; + const uint32_t localLinearIdx = (threadPos.y << BLOCK_SIZE_LOG2) + threadPos.x; + const uint32_t dstPixelIdx = (tileIdx << TILE_PIXELS_LOG2) + (blockIdxInTile << BLOCK_PIXELS_LOG2) + localLinearIdx; + const uint32_t packedDstTile = vk::RawBufferLoad(pc.dstTileLocationsAddress + tileIdx * 4u); + const uint32_t2 dstTile = uint32_t2(packedDstTile & 0xffffu, packedDstTile >> 16u); + + morton::code mc; + mc.value = uint16_t(localLinearIdx); + const uint32_t2 mortonLocalPos = _static_cast(mc); + const uint32_t2 pixelPos = pc.dstOffset + 
(dstTile << TILE_SIZE_LOG2) + (blockCoordInTile << BLOCK_SIZE_LOG2) + mortonLocalPos; + + vk::RawBufferStore(pc.deviceBufferAddress + dstPixelIdx * 4u, uint32_t(packUnorm4x8(dstImage[pixelPos]))); +} diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/73_ImageUploadBenchmark/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp new file mode 100644 index 000000000..4910cfbbe --- /dev/null +++ b/73_ImageUploadBenchmark/main.cpp @@ -0,0 +1,1368 @@ +#include "nbl/examples/examples.hpp" +#include "nbl/this_example/builtin/build/spirv/keys.hpp" +#include +#include + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::video; +using namespace nbl::examples; + +class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + +public: + ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD,
_sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); + constexpr uint32_t TOTAL_FRAMES = 1000; + + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT); + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits & ~hostCachedBits; + + m_logger->log("Memory type bits HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", + ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits); + m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", + ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + + if (!hostVisibleOnlyBits) + { + m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR); + return false; + } + + if (!deviceLocalBits) + { + m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR); + 
return false; + } + + m_queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); + { + IGPUImage::SCreationParams imgParams{}; + imgParams.type = IImage::E_TYPE::ET_2D; + uint32_t tilePerRow = (uint32_t)std::sqrt(TILES_PER_FRAME); + imgParams.extent.width = TILE_SIZE * tilePerRow; + imgParams.extent.height = TILE_SIZE * tilePerRow; + imgParams.extent.depth = 1u; + imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; + imgParams.mipLevels = 1u; + imgParams.flags = IImage::ECF_NONE; + imgParams.arrayLayers = 1u; + imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; + imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_STORAGE_BIT; + imgParams.preinitialized = false; + + m_destinationImage = m_device->createImage(std::move(imgParams)); + if (!m_destinationImage) + return logFail("Failed to create destination image!\n"); + + m_destinationImage->setObjectDebugName("Destination Image"); + + auto reqs = m_destinationImage->getMemoryReqs(); + reqs.memoryTypeBits &= deviceLocalBits; + + auto allocation = m_device->allocate(reqs, m_destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!allocation.isValid()) + return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); + } + + //compute shader + auto loadPrecompiledShader = [&]()->smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + + auto key = nbl::this_example::builtin::build::get_spirv_key(m_physicalDevice->getLimits(), m_physicalDevice->getFeatures()); + m_logger->log("Loading shader with key: %s", ILogger::ELL_INFO, key.data()); + + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Asset bundle is empty for key: %s", ILogger::ELL_ERROR, key.data()); + return smart_refctd_ptr(nullptr); + } + + m_logger->log("Asset count: 
%u, asset type: %u", ILogger::ELL_INFO, assets.size(), (uint32_t)assets[0]->getAssetType()); + + auto shader = IAsset::castDown(assets[0]); + return shader; + }; + + + //Setup compute shader resources + m_logger->log("\n=== Setting up Compute Shaders (Snake + Morton) ===", ILogger::ELL_PERFORMANCE); + { + auto shaderLib = loadPrecompiledShader.operator() < "snakeStore" > (); + if (!shaderLib) + return logFail("Failed to load shader library!\n"); + + IGPUDescriptorSetLayout::SBinding dsBinding = { + .binding = 0, + .type = IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1 + }; + auto dsLayout = m_device->createDescriptorSetLayout({ &dsBinding, 1 }); + if (!dsLayout) + return logFail("Failed to create descriptor set layout!\n"); + + asset::SPushConstantRange pcRange = { + .stageFlags = hlsl::ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(SPushConstantData) + }; + + m_pipelineLayout = m_device->createPipelineLayout({ &pcRange, 1 }, smart_refctd_ptr(dsLayout)); + if (!m_pipelineLayout) + return logFail("Failed to create pipeline layout!\n"); + + auto createPipeline = [&](const char* entryPoint, smart_refctd_ptr& outPipeline) -> bool + { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipelineLayout.get(); + params.shader.shader = shaderLib.get(); + params.shader.entryPoint = entryPoint; + if (!m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &outPipeline)) + return logFail("Failed to create %s pipeline!\n", entryPoint); + return true; + }; + + if (!createPipeline("SnakeStore", m_snakeStorePipeline)) return false; + if (!createPipeline("SnakeLoad", m_snakeLoadPipeline)) return false; + if (!createPipeline("MortonStore", m_mortonStorePipeline)) return false; + if (!createPipeline("MortonLoad", m_mortonLoadPipeline)) return false; + + auto imageView = m_device->createImageView({ + .flags = 
IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::EUF_STORAGE_BIT, + .image = smart_refctd_ptr(m_destinationImage), + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::E_FORMAT::EF_R8G8B8A8_UNORM + }); + if (!imageView) + return logFail("Failed to create image view!\n"); + + uint32_t setCount = 1; + auto dsPool = m_device->createDescriptorPoolForDSLayouts( + IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }, &setCount); + m_ds = dsPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + + IGPUDescriptorSet::SDescriptorInfo imgInfo = {}; + imgInfo.desc = imageView; + imgInfo.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL; + + IGPUDescriptorSet::SWriteDescriptorSet dsWrite = { + .dstSet = m_ds.get(), + .binding = 0, + .arrayElement = 0, + .count = 1, + .info = &imgInfo + }; + m_device->updateDescriptorSets({ &dsWrite, 1 }, {}); + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Verify Staging Buffer", m_stagingBuffer, m_stagingAlloc, m_stagingMappedPtr)) + return false; + + if (!createStagingBuffer(sizeof(uint32_t), hostVisibleOnlyBits, + "Verify Dst Tile Locations", m_verifyDstTileLocationsBuffer, m_verifyDstTileLocationsAlloc, m_verifyDstTileLocationsMappedPtr)) + return false; + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Snake Readback Buffer", m_snakeReadbackBuffer, m_snakeReadbackAlloc, m_snakeReadbackMappedPtr)) + return false; + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Morton Readback Buffer", m_mortonReadbackBuffer, m_mortonReadbackAlloc, m_mortonReadbackMappedPtr)) + return false; + + { + uint32_t* pixels = static_cast(m_stagingMappedPtr); + uint32_t totalPixels = TILE_SIZE * TILE_SIZE; + for (uint32_t i = 0; i < totalPixels; i++) + { + uint8_t val = static_cast(i & 0xFF); + pixels[i] = val | (val << 8u) | (val << 16u) | (val << 24u); + } + + if (!m_stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + 
ILogicalDevice::MappedMemoryRange range(m_stagingAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + } + { + uint32_t* dstTileLocation = static_cast(m_verifyDstTileLocationsMappedPtr); + *dstTileLocation = packDstTileLocation(0u, 0u); + + if (!m_verifyDstTileLocationsAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_verifyDstTileLocationsAlloc.memory.get(), 0, sizeof(uint32_t)); + m_device->flushMappedMemoryRanges(1, &range); + } + } + + m_cmdPool = m_device->createCommandPool( + m_queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &m_cmdbuf); + m_sem = m_device->createSemaphore(0); + + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = m_destinationImage.get(); + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + m_cmdbuf->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + 
IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = m_sem.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + m_queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = 1 }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + } + + m_logger->log("Setup complete. Running benchmarks.", ILogger::ELL_PERFORMANCE); + + runAllBenchmarks(); + + if (!verifyComputeShaders()) + return false; + + return true; + } + + bool keepRunning() override { return false; } + + void workLoopBody() override {} + + bool onAppTerminated() override + { + + m_logger->log("\nResults above. Waiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE); + std::this_thread::sleep_for(std::chrono::seconds(5)); + + if (m_stagingAlloc.memory) + m_stagingAlloc.memory->unmap(); + if (m_verifyDstTileLocationsAlloc.memory) + m_verifyDstTileLocationsAlloc.memory->unmap(); + if (m_snakeReadbackAlloc.memory) + m_snakeReadbackAlloc.memory->unmap(); + if (m_mortonReadbackAlloc.memory) + m_mortonReadbackAlloc.memory->unmap(); + return true; + } + +protected: + core::vector getQueueRequirements() const override + { + using flags_t = IQueue::FAMILY_FLAGS; + return { { + .requiredFlags = flags_t::GRAPHICS_BIT, + .disallowedFlags = flags_t::NONE, + .queueCount = 1, + .maxImageTransferGranularity = {1, 1, 1} + } }; + } + +private: + static constexpr uint32_t TILE_SIZE = 128; + static constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; + static constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; + + struct SPushConstantData + { + uint64_t deviceBufferAddress; + uint64_t dstTileLocationsAddress; + uint32_t dstOffsetX; + uint32_t dstOffsetY; + uint32_t srcWidth; + uint32_t srcHeight; + uint32_t tilesPerRow; + }; + + IQueue* m_queue = nullptr; + smart_refctd_ptr m_destinationImage; + smart_refctd_ptr m_snakeStorePipeline; + smart_refctd_ptr 
m_snakeLoadPipeline; + smart_refctd_ptr m_mortonStorePipeline; + smart_refctd_ptr m_mortonLoadPipeline; + smart_refctd_ptr m_pipelineLayout; + smart_refctd_ptr m_ds; + smart_refctd_ptr m_stagingBuffer; + smart_refctd_ptr m_verifyDstTileLocationsBuffer; + smart_refctd_ptr m_snakeReadbackBuffer; + smart_refctd_ptr m_mortonReadbackBuffer; + IDeviceMemoryAllocator::SAllocation m_stagingAlloc; + IDeviceMemoryAllocator::SAllocation m_verifyDstTileLocationsAlloc; + IDeviceMemoryAllocator::SAllocation m_snakeReadbackAlloc; + IDeviceMemoryAllocator::SAllocation m_mortonReadbackAlloc; + void* m_stagingMappedPtr = nullptr; + void* m_verifyDstTileLocationsMappedPtr = nullptr; + void* m_snakeReadbackMappedPtr = nullptr; + void* m_mortonReadbackMappedPtr = nullptr; + smart_refctd_ptr m_cmdPool; + smart_refctd_ptr m_cmdbuf; + smart_refctd_ptr m_sem; + + void runAllBenchmarks() + { + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); + constexpr uint32_t TOTAL_FRAMES = 1000; + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits & ~hostCachedBits; + + m_logger->log("\n=== RUNNING BENCHMARKS ===", ILogger::ELL_PERFORMANCE); + + struct BenchmarkResult + { + const char* name; + double wallGBps; + double gpuGBps; + double memcpyGBps; + }; + std::vector results; + + //SysRAM benchmarks + { + smart_refctd_ptr benchStagingBuffer; + IDeviceMemoryAllocator::SAllocation benchStagingAlloc; + void* benchMappedPtr = nullptr; + uint32_t benchBufSize = 
STAGING_BUFFER_SIZE; + + if (createStagingBuffer(benchBufSize, hostVisibleOnlyBits, + "Benchmark Staging (SysRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr)) + { + m_logger->log("\n--- CopyBufferToImage (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rCopy = runBenchmarkBufferToImageCopyCommand("CopyBufferToImage (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "CopyBufferToImage (SysRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps }); + + m_logger->log("\n--- Snake Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rSnake = runBenchmarkCompute("Snake Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_snakeStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 128u, 4u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Snake Compute (SysRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps }); + + m_logger->log("\n--- Morton Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rMorton = runBenchmarkCompute("Morton Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_mortonStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 16u, 16u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Morton Compute (SysRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps }); + + benchStagingAlloc.memory->unmap(); + } + } + + //BAR/VRAM benchmarks (if available) + if (hostVisibleDeviceLocalBits) + { + smart_refctd_ptr benchStagingBuffer; + IDeviceMemoryAllocator::SAllocation benchStagingAlloc; + void* benchMappedPtr = nullptr; + uint32_t benchBufSize = STAGING_BUFFER_SIZE; + + if (createStagingBuffer(benchBufSize, 
hostVisibleDeviceLocalBits, + "Benchmark Staging (BAR/VRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr)) + { + m_logger->log("\n--- CopyBufferToImage (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rCopy = runBenchmarkBufferToImageCopyCommand("CopyBufferToImage (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "CopyBufferToImage (BAR/VRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps }); + + m_logger->log("\n--- Snake Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rSnake = runBenchmarkCompute("Snake Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_snakeStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 128u, 4u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Snake Compute (BAR/VRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps }); + + m_logger->log("\n--- Morton Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rMorton = runBenchmarkCompute("Morton Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_mortonStorePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, 16u, 16u, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({ "Morton Compute (BAR/VRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps }); + + benchStagingAlloc.memory->unmap(); + } + } + + //Summary table + m_logger->log("\n=== BENCHMARK RESULTS ===", ILogger::ELL_PERFORMANCE); + m_logger->log("%-36s | Wall GB/s | GPU GB/s | Memcpy GB/s", ILogger::ELL_PERFORMANCE, "Strategy"); + m_logger->log("-------------------------------------+-----------+----------+------------", ILogger::ELL_PERFORMANCE); + for (const auto& r : results) + { 
+ m_logger->log("%-36s | %9.2f | %8.2f | %10.2f", ILogger::ELL_PERFORMANCE, r.name, r.wallGBps, r.gpuGBps, r.memcpyGBps); + } + m_logger->log("=====================================+===========+==========+============", ILogger::ELL_PERFORMANCE); + } + + struct BenchResult + { + double wallGBps; + double gpuGBps; + double memcpyGBps; + }; + + static uint32_t packDstTileLocation(uint32_t tileX, uint32_t tileY) + { + return (tileX & 0xffffu) | ((tileY & 0xffffu) << 16u); + } + + void generateTileCopyRegions( + IImage::SBufferCopy* outRegions, + uint32_t tilesPerFrame, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t imageWidth, + uint32_t bufferBaseOffset) + { + uint32_t tilesPerRow = imageWidth / tileSize; + for (size_t i = 0; i < tilesPerFrame; i++) + { + uint32_t tileX = (i % tilesPerRow) * tileSize; + uint32_t tileY = (i / tilesPerRow) * tileSize; + + outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes); + outRegions[i].bufferRowLength = tileSize; + outRegions[i].bufferImageHeight = tileSize; + outRegions[i].imageOffset = { tileX, tileY, 0 }; + outRegions[i].imageExtent = { tileSize, tileSize, 1 }; + outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + outRegions[i].imageSubresource.mipLevel = 0; + outRegions[i].imageSubresource.baseArrayLayer = 0; + outRegions[i].imageSubresource.layerCount = 1; + } + } + + BenchResult runBenchmarkBufferToImageCopyCommand( + const char* strategyName, + IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = 
framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = 
PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; + + std::vector cpuSourceData(partitionSize); + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } + std::vector> regionsPerFrame(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + regionsPerFrame[i].resize(tilesPerFrame); + uint32_t bufferOffset = i * partitionSize; + generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({ &frameWaitInfo, 1 }); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData.data(), partitionSize); + + if 
(!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::GENERAL; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = destinationImage; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); + + commandBuffers[cmdBufIndex]->copyBufferToImage( + stagingBuffer, + destinationImage, + IImage::LAYOUT::GENERAL, + tilesPerFrame, + regionsPerFrame[cmdBufIndex].data() + ); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + 
afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = commandBuffers[cmdBufIndex].get() }; + frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 }; + + queue->submit({ &frameSubmitInfo, 1 }); + auto t5 = std::chrono::high_resolution_clock::now(); + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); + } + + // End marker is after last submit, NOT after GPU finishes. 
+ auto endTime = std::chrono::high_resolution_clock::now(); + + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &finalWait, 1 }); + + // Read timestamps from the last completed flight of command buffers + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0); + + double wallThroughputGBps = totalGB / elapsedSeconds; + double gpuThroughputGBps = totalGB / totalGpuTimeSeconds; + + m_logger->log(" GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps); + m_logger->log(" GPU only throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps); + + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s 
(%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + double memcpyGBps = totalGB / totalMemcpyTime; + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps); + + return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps }; + } + + + double runBenchmarkImageStaging( + const char* strategyName, + const std::vector>& stagingImages, + const std::vector& imageMemoryOffsets, + IDeviceMemoryAllocation* stagingMemory, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + // Disabled after testing: this path needs CPU writes into host-visible + // OPTIMAL images, but the memory layout and preinitialized-image lifetime + // rules are too implementation-dependent to make this a clean benchmark. 
+ return 0.0; + } + + BenchResult runBenchmarkCompute( + const char* strategyName, + IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, + void* mappedPtr, + IGPUImage* destinationImage, + IGPUComputePipeline* pipeline, + IGPUPipelineLayout* pipelineLayout, + IGPUDescriptorSet* ds, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t workgroupSizeX, + uint32_t workgroupSizeY, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + 
initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t tilesPerRow = imageWidth / tileSize; + uint32_t tileRows = (tilesPerFrame + tilesPerRow - 1u) / tilesPerRow; + uint32_t dispatchX = (tilesPerRow * tileSize + workgroupSizeX - 1u) / workgroupSizeX; + uint32_t dispatchY = (tileRows * tileSize + workgroupSizeY - 1u) / workgroupSizeY; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; + + std::vector cpuSourceData(partitionSize); + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } + + const uint32_t dstTileLocationsBufferSize = framesInFlight * tilesPerFrame * sizeof(uint32_t); + std::vector dstTileLocations(framesInFlight * tilesPerFrame); + for (uint32_t flight = 0; flight < framesInFlight; 
flight++) + { + for (uint32_t i = 0; i < tilesPerFrame; i++) + { + uint32_t dstTileX = i % tilesPerRow; + uint32_t dstTileY = i / tilesPerRow; + dstTileLocations[flight * tilesPerFrame + i] = packDstTileLocation(dstTileX, dstTileY); + } + } + + smart_refctd_ptr dstTileLocationsBuffer; + IDeviceMemoryAllocator::SAllocation dstTileLocationsAlloc; + void* dstTileLocationsMappedPtr = nullptr; + const uint32_t dstTileLocationsMemoryTypeBits = + m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(stagingAlloc.memory->getMemoryPropertyFlags()); + if (!createStagingBuffer(dstTileLocationsBufferSize, dstTileLocationsMemoryTypeBits, + "Benchmark Dst Tile Locations", dstTileLocationsBuffer, dstTileLocationsAlloc, dstTileLocationsMappedPtr)) + return { 0.0, 0.0, 0.0 }; + + memcpy(dstTileLocationsMappedPtr, dstTileLocations.data(), dstTileLocationsBufferSize); + if (!dstTileLocationsAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(dstTileLocationsAlloc.memory.get(), 0, dstTileLocationsBufferSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({ &frameWaitInfo, 1 }); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData.data(), 
partitionSize); + + if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + }; + + IGPUCommandBuffer::SImageMemoryBarrier dstBarrier = {}; + dstBarrier.oldLayout = IImage::LAYOUT::GENERAL; + dstBarrier.newLayout = IImage::LAYOUT::GENERAL; + dstBarrier.image = destinationImage; + dstBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + dstBarrier.subresourceRange.baseMipLevel = 0; + dstBarrier.subresourceRange.levelCount = 1; + dstBarrier.subresourceRange.baseArrayLayer = 0; + dstBarrier.subresourceRange.layerCount = 1; + dstBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + dstBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + dstBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + dstBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { + .memBarriers = {&memBarrier, 1}, + .imgBarriers = {&dstBarrier, 1} + }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 0); + + commandBuffers[cmdBufIndex]->bindComputePipeline(pipeline); + const IGPUDescriptorSet* sets[] = { ds }; + 
commandBuffers[cmdBufIndex]->bindDescriptorSets(asset::EPBP_COMPUTE, pipelineLayout, 0, 1, sets); + + uint64_t dstTileLocationsOffset = uint64_t(cmdBufIndex) * tilesPerFrame * sizeof(uint32_t); + SPushConstantData pc = { + .deviceBufferAddress = stagingBuffer->getDeviceAddress() + bufferOffset, + .dstTileLocationsAddress = dstTileLocationsBuffer->getDeviceAddress() + dstTileLocationsOffset, + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = tileSize, + .srcHeight = tileSize, + .tilesPerRow = tilesPerRow + }; + commandBuffers[cmdBufIndex]->pushConstants(pipelineLayout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &pc); + commandBuffers[cmdBufIndex]->dispatch(dispatchX, dispatchY, 1u); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = 
commandBuffers[cmdBufIndex].get() }; + frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 }; + + queue->submit({ &frameSubmitInfo, 1 }); + auto t5 = std::chrono::high_resolution_clock::now(); + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); + } + + // End marker is after last submit, NOT after GPU finishes. + auto endTime = std::chrono::high_resolution_clock::now(); + + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &finalWait, 1 }); + + // Read timestamps from the last completed flight of command buffers + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * 
tilesPerFrame * tileSizeBytes; + double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0); + + double wallThroughputGBps = totalGB / elapsedSeconds; + double gpuThroughputGBps = totalGB / totalGpuTimeSeconds; + + m_logger->log(" GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps); + m_logger->log(" GPU only throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps); + + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + double memcpyGBps = totalGB / totalMemcpyTime; + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps); + + dstTileLocationsAlloc.memory->unmap(); + + return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps }; + } + + bool createStagingBuffer( + uint32_t bufferSize, + uint32_t memoryTypeBits, + const char* debugName, + smart_refctd_ptr& outBuffer, + IDeviceMemoryAllocator::SAllocation& outAllocation, + void*& outMappedPtr) + { + IGPUBuffer::SCreationParams params; + params.size = bufferSize; + params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + outBuffer = m_device->createBuffer(std::move(params)); + if (!outBuffer) + return logFail("Failed to create GPU buffer of size %d!\n", bufferSize); + + outBuffer->setObjectDebugName(debugName); + + auto reqs = outBuffer->getMemoryReqs(); + 
reqs.memoryTypeBits &= memoryTypeBits; + + outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + if (!outAllocation.isValid()) + return logFail("Failed to allocate Device Memory!\n"); + + outMappedPtr = outAllocation.memory->map({ 0ull, outAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE); + if (!outMappedPtr) + return logFail("Failed to map Device Memory!\n"); + + return true; + } + + bool verifyComputeShaders() + { + const uint64_t semValue = 2; + + m_cmdPool->reset(); + + memset(m_snakeReadbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + memset(m_mortonReadbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + const IGPUDescriptorSet* sets[] = { m_ds.get() }; + SPushConstantData storePc = { + .deviceBufferAddress = m_stagingBuffer->getDeviceAddress(), + .dstTileLocationsAddress = m_verifyDstTileLocationsBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE, + .tilesPerRow = 1u + }; + + //SNAKE VERIFICATION + { + IGPUCommandBuffer::SImageMemoryBarrier snakePreBarrier = {}; + snakePreBarrier.oldLayout = IImage::LAYOUT::GENERAL; + snakePreBarrier.newLayout = IImage::LAYOUT::GENERAL; + snakePreBarrier.image = m_destinationImage.get(); + snakePreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + 
snakePreBarrier.subresourceRange.baseMipLevel = 0; + snakePreBarrier.subresourceRange.levelCount = 1; + snakePreBarrier.subresourceRange.baseArrayLayer = 0; + snakePreBarrier.subresourceRange.layerCount = 1; + snakePreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + snakePreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + snakePreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + snakePreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakePreBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_snakeStorePipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(1u, TILE_SIZE / 4u, 1u); + + { + IGPUCommandBuffer::SImageMemoryBarrier snakeMidBarrier = {}; + snakeMidBarrier.oldLayout = IImage::LAYOUT::GENERAL; + snakeMidBarrier.newLayout = IImage::LAYOUT::GENERAL; + snakeMidBarrier.image = m_destinationImage.get(); + snakeMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + snakeMidBarrier.subresourceRange.baseMipLevel = 0; + snakeMidBarrier.subresourceRange.levelCount = 1; + snakeMidBarrier.subresourceRange.baseArrayLayer = 0; + snakeMidBarrier.subresourceRange.layerCount = 1; + snakeMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + snakeMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + snakeMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + snakeMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakeMidBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_snakeLoadPipeline.get()); + 
m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData snakeLoadPc = { + .deviceBufferAddress = m_snakeReadbackBuffer->getDeviceAddress(), + .dstTileLocationsAddress = m_verifyDstTileLocationsBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE, + .tilesPerRow = 1u + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &snakeLoadPc); + m_cmdbuf->dispatch(1u, TILE_SIZE / 4u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + //MORTON VERIFICATION + { + IGPUCommandBuffer::SImageMemoryBarrier mortonPreBarrier = {}; + mortonPreBarrier.oldLayout = IImage::LAYOUT::GENERAL; + mortonPreBarrier.newLayout = IImage::LAYOUT::GENERAL; + mortonPreBarrier.image = m_destinationImage.get(); + mortonPreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + mortonPreBarrier.subresourceRange.baseMipLevel = 0; + mortonPreBarrier.subresourceRange.levelCount = 1; + mortonPreBarrier.subresourceRange.baseArrayLayer = 0; + mortonPreBarrier.subresourceRange.layerCount = 1; + mortonPreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + mortonPreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + mortonPreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + mortonPreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonPreBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_mortonStorePipeline.get()); + 
m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(TILE_SIZE / 16u, TILE_SIZE / 16u, 1u); + + { + IGPUCommandBuffer::SImageMemoryBarrier mortonMidBarrier = {}; + mortonMidBarrier.oldLayout = IImage::LAYOUT::GENERAL; + mortonMidBarrier.newLayout = IImage::LAYOUT::GENERAL; + mortonMidBarrier.image = m_destinationImage.get(); + mortonMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + mortonMidBarrier.subresourceRange.baseMipLevel = 0; + mortonMidBarrier.subresourceRange.levelCount = 1; + mortonMidBarrier.subresourceRange.baseArrayLayer = 0; + mortonMidBarrier.subresourceRange.layerCount = 1; + mortonMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + mortonMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + mortonMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + mortonMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonMidBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_mortonLoadPipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData mortonLoadPc = { + .deviceBufferAddress = m_mortonReadbackBuffer->getDeviceAddress(), + .dstTileLocationsAddress = m_verifyDstTileLocationsBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE, + .tilesPerRow = 1u + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &mortonLoadPc); + m_cmdbuf->dispatch(TILE_SIZE / 16u, TILE_SIZE / 16u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + 
.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + m_cmdbuf->end(); + + // Submit and wait + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = m_sem.get(), + .value = semValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + m_api->startCapture(); + m_queue->submit({ &submitInfo, 1 }); + m_api->endCapture(); + + ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = semValue }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + + const uint32_t* srcPixels = static_cast(m_stagingMappedPtr); + uint32_t totalPixels = TILE_SIZE * TILE_SIZE; + + const uint32_t* snakeDstPixels = static_cast(m_snakeReadbackMappedPtr); + uint32_t snakeMatchCount = 0; + uint32_t snakeFirstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == snakeDstPixels[i]) + snakeMatchCount++; + else if (snakeFirstMismatchIdx == ~0u) + snakeFirstMismatchIdx = i; + } + + if (snakeMatchCount == totalPixels) + { + m_logger->log("Snake 
verification PASS. All %u pixels match.", ILogger::ELL_PERFORMANCE, totalPixels); + } + else + { + m_logger->log("Snake verification FAIL. %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, snakeMatchCount, totalPixels, snakeFirstMismatchIdx, srcPixels[snakeFirstMismatchIdx], snakeDstPixels[snakeFirstMismatchIdx]); + return false; + } + + const uint32_t* mortonDstPixels = static_cast(m_mortonReadbackMappedPtr); + uint32_t mortonMatchCount = 0; + uint32_t mortonFirstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == mortonDstPixels[i]) + mortonMatchCount++; + else if (mortonFirstMismatchIdx == ~0u) + mortonFirstMismatchIdx = i; + } + + if (mortonMatchCount == totalPixels) + { + m_logger->log("Morton verification PASS. All %u pixels match.", ILogger::ELL_PERFORMANCE, totalPixels); + } + else + { + m_logger->log("Morton verification FAIL. %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, mortonMatchCount, totalPixels, mortonFirstMismatchIdx, srcPixels[mortonFirstMismatchIdx], mortonDstPixels[mortonFirstMismatchIdx]); + return false; + } + + return true; + } +}; + +NBL_MAIN_FUNC(ImageUploadBenchmarkApp) diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy new file mode 100644 index 000000000..1249f10b5 --- /dev/null +++ b/73_ImageUploadBenchmark/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CImageUploadBenchmark extends IBuilder +{ + public CImageUploadBenchmark(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = 
axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CImageUploadBenchmark(_agent, _info) /* factory must return this file's builder class; was CStreamingAndBufferDeviceAddressBuilder, copy-pasted from the streaming example, which is not defined here and would fail at pipeline load */ +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe482aa4..2d4ed7408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(73_ImageUploadBenchmark) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS)