Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions 73_ImageUploadBenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Pull in the shared `common` module.
# OPTIONAL is needed for the check below to be reachable: without it, include()
# itself raises an error when the module cannot be found, so RES would never be
# observed as NOTFOUND and the more helpful message would never be printed.
include(common OPTIONAL RESULT_VARIABLE RES)
if(NOT RES)
    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create the executable project for this example. The helper is not defined in
# this file (presumably it comes from the module included above - confirm).
# The first four arguments are deliberately passed empty; the fifth forwards the
# value of NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET.
# NOTE(review): the rest of this script reads EXECUTABLE_NAME, which is presumably
# set by this call - verify against the helper's definition.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optional step: when NBL_EMBED_BUILTIN_RESOURCES is truthy, every file found
# under app_resources/ is registered with the builtin-resource helpers and the
# resulting target is linked into the example executable.
if(NBL_EMBED_BUILTIN_RESOURCES)
    set(RESOURCE_DIR "app_resources")
    set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)

    # Absolute locations: where resources are searched for, and where the
    # generated headers / sources are placed inside the binary directory.
    get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
    get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
    get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)

    # Paths are collected relative to the resource directory. CONFIGURE_DEPENDS
    # asks CMake to re-check the glob at build time so added or removed files are
    # picked up.
    file(
        GLOB_RECURSE BUILTIN_RESOURCE_FILES
        RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}"
        CONFIGURE_DEPENDS
        "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*"
    )
    foreach(RES_FILE IN LISTS BUILTIN_RESOURCE_FILES)
        LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
    endforeach()

    # Build the resource target from the accumulated list, exposed under the
    # nbl::this_example::builtin namespace string passed below.
    ADD_CUSTOM_BUILTIN_RESOURCES(
        ${_BR_TARGET_}
        RESOURCES_TO_EMBED
        "${_SEARCH_DIRECTORIES_}"
        "${RESOURCE_DIR}"
        "nbl::this_example::builtin"
        "${_OUTPUT_DIRECTORY_HEADER_}"
        "${_OUTPUT_DIRECTORY_SOURCE_}"
    )

    LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()

# ---- Offline shader compilation ------------------------------------------------
# SM is spliced into the `-T lib_${SM}` option further down; OUTPUT_DIRECTORY is
# where the compile rules are told to place their results.
set(SM 6_8)
set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")

# Shader files the compile rules depend on.
set(DEPENDS
    app_resources/common.hlsl
    app_resources/tile_upload.comp.hlsl
)

# Attach the shaders to the executable target so they are listed with it, and
# mark them HEADER_FILE_ONLY so the build never tries to compile them as sources.
target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)

# Compile description handed to NBL_CREATE_NSC_COMPILE_RULES via INPUTS.
# It is a bracket argument, so set() stores the text verbatim; string(CONFIGURE)
# then substitutes any @VAR@ / ${VAR} references (the text currently has none).
set(JSON [=[
[
{
"INPUT": "app_resources/tile_upload.comp.hlsl",
"KEY": "snakeStore"
}
]
]=])
string(CONFIGURE "${JSON}" JSON)

# Register the compile rules. The helper is defined outside this file; the
# argument meanings below are read off the keyword names - confirm against its
# definition. KEYS receives the helper's output (OUTPUT_VAR) and is consumed by
# the archive step that follows.
NBL_CREATE_NSC_COMPILE_RULES(
    TARGET ${EXECUTABLE_NAME}SPIRV
    LINK_TO ${EXECUTABLE_NAME}
    DEPENDS ${DEPENDS}
    BINARY_DIR ${OUTPUT_DIRECTORY}
    MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
    COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM}
    OUTPUT_VAR KEYS
    INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
    NAMESPACE nbl::this_example::builtin::build
    INPUTS ${JSON}
)

# Bundle what was produced above (BUILTINS ${KEYS}, bound to OUTPUT_DIRECTORY)
# into a resource archive target linked to the executable.
NBL_CREATE_RESOURCE_ARCHIVE(
    NAMESPACE nbl::this_example::builtin::build
    TARGET ${EXECUTABLE_NAME}_builtinsBuild
    LINK_TO ${EXECUTABLE_NAME}
    BIND ${OUTPUT_DIRECTORY}
    BUILTINS ${KEYS}
)
11 changes: 11 additions & 0 deletions 73_ImageUploadBenchmark/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Include guard: this header declares a struct, so pulling it into the same
// translation unit twice (directly or through another header) would otherwise
// fail with a redefinition error.
#ifndef NBL_EXAMPLE_IMAGE_UPLOAD_BENCHMARK_COMMON_HLSL_INCLUDED
#define NBL_EXAMPLE_IMAGE_UPLOAD_BENCHMARK_COMMON_HLSL_INCLUDED

#include <nbl/builtin/hlsl/morton.hlsl>

// Push-constant block consumed by the entry points in tile_upload.comp.hlsl.
struct PushConstantData
{
    // 64-bit address of the packed texel data; the shaders read or write one
    // 32-bit value per texel at `deviceBufferAddress + texelIndex * 4`.
    uint64_t deviceBufferAddress;
    // 64-bit address of a table holding one 32-bit entry per tile index. The
    // shaders take the tile's x from the low 16 bits and its y from the high 16 bits.
    uint64_t dstTileLocationsAddress;
    // Texel offset added to every computed image coordinate.
    uint32_t2 dstOffset;
    // NOTE(review): srcWidth / srcHeight are not read by any entry point in
    // tile_upload.comp.hlsl; presumably the source extent in texels - confirm.
    uint32_t srcWidth;
    uint32_t srcHeight;
    // Row pitch, in tiles, used to turn a 2D tile coordinate into a linear tile index.
    uint32_t tilesPerRow;
};

#endif // NBL_EXAMPLE_IMAGE_UPLOAD_BENCHMARK_COMMON_HLSL_INCLUDED
99 changes: 99 additions & 0 deletions 73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#include "common.hlsl"

// Image accessed by every entry point in this file: written by the *Store
// kernels, read by the *Load kernels.
[[vk::binding(0,0)]] RWTexture2D<float32_t4> dstImage;
// Per-dispatch parameters (struct declared in common.hlsl).
[[vk::push_constant]] PushConstantData pc;

using namespace nbl::hlsl;

// Tile geometry. Sizes are derived from their log2 so the two can never disagree:
// a tile is 128x128 texels (2^7 per side, 2^14 texels in total).
static const uint32_t TILE_SIZE_LOG2 = 7u;
static const uint32_t TILE_SIZE = 1u << TILE_SIZE_LOG2;
static const uint32_t TILE_SIZE_MASK = TILE_SIZE - 1u;
static const uint32_t TILE_PIXELS_LOG2 = 2u * TILE_SIZE_LOG2;

// Block geometry used by the Morton kernels: a block is 16x16 texels
// (2^4 per side, 2^8 texels in total), giving 8 blocks along each tile side.
static const uint32_t BLOCK_SIZE_LOG2 = 4u;
static const uint32_t BLOCK_SIZE = 1u << BLOCK_SIZE_LOG2;
static const uint32_t BLOCK_PIXELS_LOG2 = 2u * BLOCK_SIZE_LOG2;
static const uint32_t BLOCKS_PER_TILE_LOG2 = TILE_SIZE_LOG2 - BLOCK_SIZE_LOG2;
static const uint32_t BLOCKS_PER_TILE = 1u << BLOCKS_PER_TILE_LOG2;

// Buffer -> image copy, one texel per invocation (workgroup = 128x4 invocations).
// The buffer is addressed tile by tile, row-major inside each tile; the tile's
// position in the image comes from the table at pc.dstTileLocationsAddress.
// NOTE(review): there is no bounds test against pc.srcWidth / pc.srcHeight -
// presumably the dispatch covers whole tiles only; confirm with the caller.
// NOTE(review): byte offsets are formed in 32 bits before being added to the
// 64-bit address, so offsets of 4 GiB or more would wrap - confirm that limit is fine.
[numthreads(128, 4, 1)]
[shader("compute")]
void SnakeStore(uint32_t3 ID : SV_DispatchThreadID)
{
    // Split the dispatch-wide coordinate into (tile, texel inside that tile).
    const uint32_t2 texelInTile = ID.xy & TILE_SIZE_MASK;
    const uint32_t2 tile = ID.xy >> TILE_SIZE_LOG2;
    const uint32_t tileIndex = tile.x + tile.y * pc.tilesPerRow;

    // Linear index of this texel in the buffer: whole tiles first, then rows of the tile.
    const uint32_t rowMajorInTile = texelInTile.x + (texelInTile.y << TILE_SIZE_LOG2);
    const uint32_t srcTexelIndex = rowMajorInTile + (tileIndex << TILE_PIXELS_LOG2);

    // Table entry: destination tile x in the low half-word, y in the high half-word.
    const uint32_t tileEntry = vk::RawBufferLoad<uint32_t>(pc.dstTileLocationsAddress + tileIndex * 4u);
    const uint32_t2 dstTileCoord = uint32_t2(tileEntry & 0xffffu, tileEntry >> 16u);
    const uint32_t2 dstTexel = texelInTile + (dstTileCoord << TILE_SIZE_LOG2) + pc.dstOffset;

    const uint32_t texelBits = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + srcTexelIndex * 4u);

    dstImage[dstTexel] = unpackUnorm4x8(int32_t(texelBits));
}

// Image -> buffer copy, one texel per invocation (workgroup = 128x4 invocations).
// Uses the same addressing as SnakeStore in the opposite direction: the texel is
// read from dstImage, packed to 32 bits and written to the buffer slot
// `tile index * texels-per-tile + row-major index inside the tile`.
// NOTE(review): there is no bounds test against pc.srcWidth / pc.srcHeight -
// presumably the dispatch covers whole tiles only; confirm with the caller.
// NOTE(review): byte offsets are formed in 32 bits before being added to the
// 64-bit address, so offsets of 4 GiB or more would wrap - confirm that limit is fine.
[numthreads(128, 4, 1)]
[shader("compute")]
void SnakeLoad(uint32_t3 ID : SV_DispatchThreadID)
{
    // Split the dispatch-wide coordinate into (tile, texel inside that tile).
    const uint32_t2 texelInTile = ID.xy & TILE_SIZE_MASK;
    const uint32_t2 tile = ID.xy >> TILE_SIZE_LOG2;
    const uint32_t tileIndex = tile.x + tile.y * pc.tilesPerRow;

    // Linear slot of this texel in the buffer.
    const uint32_t rowMajorInTile = texelInTile.x + (texelInTile.y << TILE_SIZE_LOG2);
    const uint32_t bufferTexelIndex = rowMajorInTile + (tileIndex << TILE_PIXELS_LOG2);

    // Table entry: image tile x in the low half-word, y in the high half-word.
    const uint32_t tileEntry = vk::RawBufferLoad<uint32_t>(pc.dstTileLocationsAddress + tileIndex * 4u);
    const uint32_t2 imageTileCoord = uint32_t2(tileEntry & 0xffffu, tileEntry >> 16u);
    const uint32_t2 imageTexel = texelInTile + (imageTileCoord << TILE_SIZE_LOG2) + pc.dstOffset;

    vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + bufferTexelIndex * 4u, uint32_t(packUnorm4x8(dstImage[imageTexel])));
}

// Buffer -> image copy where each 16x16 workgroup handles one block of a tile.
// Invocations read consecutive buffer entries (tile, then block, then the
// invocation's linear index in the workgroup); that same linear index is loaded
// into a morton::code<false, 4, 2> and converted to a 2D offset inside the block
// (presumably a Morton/Z-order decode - see nbl/builtin/hlsl/morton.hlsl).
// NOTE(review): there is no bounds test against pc.srcWidth / pc.srcHeight -
// presumably the dispatch covers whole tiles only; confirm with the caller.
// NOTE(review): byte offsets are formed in 32 bits before being added to the
// 64-bit address, so offsets of 4 GiB or more would wrap - confirm that limit is fine.
[numthreads(16, 16, 1)]
[shader("compute")]
void MortonStore(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
{
    // Workgroup id -> (tile, block inside that tile).
    const uint32_t2 blockInTile = GroupID.xy & (BLOCKS_PER_TILE - 1u);
    const uint32_t2 tile = GroupID.xy >> BLOCKS_PER_TILE_LOG2;
    const uint32_t tileIndex = tile.x + tile.y * pc.tilesPerRow;

    // Linear indices: block within the tile, invocation within the workgroup.
    const uint32_t blockIndex = blockInTile.x + (blockInTile.y << BLOCKS_PER_TILE_LOG2);
    const uint32_t threadIndex = ID.x + (ID.y << BLOCK_SIZE_LOG2);
    const uint32_t srcTexelIndex = threadIndex + (blockIndex << BLOCK_PIXELS_LOG2) + (tileIndex << TILE_PIXELS_LOG2);

    // Table entry: destination tile x in the low half-word, y in the high half-word.
    const uint32_t tileEntry = vk::RawBufferLoad<uint32_t>(pc.dstTileLocationsAddress + tileIndex * 4u);
    const uint32_t2 dstTileCoord = uint32_t2(tileEntry & 0xffffu, tileEntry >> 16u);

    // Reinterpret the linear invocation index as a 2D position inside the block.
    morton::code<false, 4, 2> zCode;
    zCode.value = uint16_t(threadIndex);
    const uint32_t2 texelInBlock = _static_cast<uint32_t2>(zCode);

    const uint32_t2 dstTexel = texelInBlock + (blockInTile << BLOCK_SIZE_LOG2) + (dstTileCoord << TILE_SIZE_LOG2) + pc.dstOffset;

    const uint32_t texelBits = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + srcTexelIndex * 4u);
    dstImage[dstTexel] = unpackUnorm4x8(int32_t(texelBits));
}

// Image -> buffer copy where each 16x16 workgroup handles one block of a tile.
// Uses the same addressing as MortonStore in the opposite direction: the texel at
// the block-local position obtained from morton::code<false, 4, 2> is read from
// dstImage, packed to 32 bits and written to the buffer slot given by
// (tile, block, linear invocation index). The conversion is presumably a
// Morton/Z-order decode - see nbl/builtin/hlsl/morton.hlsl.
// NOTE(review): there is no bounds test against pc.srcWidth / pc.srcHeight -
// presumably the dispatch covers whole tiles only; confirm with the caller.
// NOTE(review): byte offsets are formed in 32 bits before being added to the
// 64-bit address, so offsets of 4 GiB or more would wrap - confirm that limit is fine.
[numthreads(16, 16, 1)]
[shader("compute")]
void MortonLoad(uint32_t3 ID : SV_GroupThreadID, uint32_t3 GroupID : SV_GroupID)
{
    // Workgroup id -> (tile, block inside that tile).
    const uint32_t2 blockInTile = GroupID.xy & (BLOCKS_PER_TILE - 1u);
    const uint32_t2 tile = GroupID.xy >> BLOCKS_PER_TILE_LOG2;
    const uint32_t tileIndex = tile.x + tile.y * pc.tilesPerRow;

    // Linear indices: block within the tile, invocation within the workgroup.
    const uint32_t blockIndex = blockInTile.x + (blockInTile.y << BLOCKS_PER_TILE_LOG2);
    const uint32_t threadIndex = ID.x + (ID.y << BLOCK_SIZE_LOG2);
    const uint32_t bufferTexelIndex = threadIndex + (blockIndex << BLOCK_PIXELS_LOG2) + (tileIndex << TILE_PIXELS_LOG2);

    // Table entry: image tile x in the low half-word, y in the high half-word.
    const uint32_t tileEntry = vk::RawBufferLoad<uint32_t>(pc.dstTileLocationsAddress + tileIndex * 4u);
    const uint32_t2 imageTileCoord = uint32_t2(tileEntry & 0xffffu, tileEntry >> 16u);

    // Reinterpret the linear invocation index as a 2D position inside the block.
    morton::code<false, 4, 2> zCode;
    zCode.value = uint16_t(threadIndex);
    const uint32_t2 texelInBlock = _static_cast<uint32_t2>(zCode);

    const uint32_t2 imageTexel = texelInBlock + (blockInTile << BLOCK_SIZE_LOG2) + (imageTileCoord << TILE_SIZE_LOG2) + pc.dstOffset;

    vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + bufferTexelIndex * 4u, uint32_t(packUnorm4x8(dstImage[imageTexel])));
}
28 changes: 28 additions & 0 deletions 73_ImageUploadBenchmark/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
Loading