From de8e26aa71cc276d40093c8d01a109cfc521dde0 Mon Sep 17 00:00:00 2001 From: Richard Geslot Date: Sun, 20 Apr 2025 22:28:13 +0200 Subject: [PATCH 1/3] new feature: compress precompiled --- Orochi/OrochiUtils.cpp | 36 ++++++++++++++++++++++++++++++ Orochi/OrochiUtils.h | 3 +++ ParallelPrimitives/RadixSort.cpp | 5 +++-- scripts/convert_binary_to_array.py | 30 +++++++++++++++++-------- scripts/create_archive.cmake | 22 ++++++++++++++++++ 5 files changed, 85 insertions(+), 11 deletions(-) create mode 100644 scripts/create_archive.cmake diff --git a/Orochi/OrochiUtils.cpp b/Orochi/OrochiUtils.cpp index 7c2fcd9..8403f0f 100644 --- a/Orochi/OrochiUtils.cpp +++ b/Orochi/OrochiUtils.cpp @@ -36,6 +36,10 @@ #include #endif +#ifdef ORO_LINK_ZSTD +#include +#endif + inline std::wstring utf8_to_wstring( const std::string& str ) { std::wstring_convert> myconv; @@ -790,3 +794,35 @@ void OrochiUtils::launch2D( oroFunction func, int nx, int ny, const void** args, OROASSERT( e == oroSuccess, 0 ); } +void OrochiUtils::DecompressPrecompiled(std::vector& out, const unsigned char* compressedInput, size_t compressedInput_sizeByte, size_t uncompressed_sizeByte) +{ + if ( uncompressed_sizeByte > 0 ) // if the input data is actually compressed + { + #ifdef ORO_LINK_ZSTD + out.assign(uncompressed_sizeByte,0); + + size_t decompressedSize = ZSTD_decompress( + out.data(), // final uncompressed buffer + out.size(), // final size + compressedInput, // compressed buffer + compressedInput_sizeByte // compressed buffer - size + ); + + if ( decompressedSize != uncompressed_sizeByte ) + throw std::runtime_error( "ERROR: ZSTD_decompress FAILED." ); + #else + + throw std::runtime_error( "ERROR: ZSTD is not part of this build." ); + + #endif + + } + else // if the input data is NOT compressed, buypass this decompress process. 
+ { + out = std::vector(compressedInput, compressedInput + compressedInput_sizeByte ); + } + return; +} + + + diff --git a/Orochi/OrochiUtils.h b/Orochi/OrochiUtils.h index e8ca8cf..528e1f0 100644 --- a/Orochi/OrochiUtils.h +++ b/Orochi/OrochiUtils.h @@ -83,6 +83,9 @@ class OrochiUtils static void getModule( oroDevice device, const char* code, const char* path, std::vector* optsIn, const char* funcName, oroModule* moduleOut ); static void launch1D( oroFunction func, int nx, const void** args, int wgSize = 64, unsigned int sharedMemBytes = 0, oroStream stream = 0 ); static void launch2D( oroFunction func, int nx, int ny, const void** args, int wgSizeX = 8, int wgSizeY = 8, unsigned int sharedMemBytes = 0, oroStream stream = 0 ); + + // if 'uncompressed_sizeByte' is set to 0, it means the input value is not compressed and this function will output the raw buffer. + static void DecompressPrecompiled(std::vector& out, const unsigned char* compressedInput, size_t compressedInput_sizeByte, size_t uncompressed_sizeByte); template static void malloc( T*& ptr, size_t n ) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index b99ad8d..723b0fd 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -54,6 +54,7 @@ static const char** RadixSortKernelsIncludes = nullptr; #else const unsigned char oro_compiled_kernels_h[] = ""; const size_t oro_compiled_kernels_h_size = 0; +const size_t oro_compiled_kernels_h_size_uncompressed = 0; #endif constexpr uint64_t div_round_up64( uint64_t val, uint64_t divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } @@ -189,8 +190,8 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string { if constexpr( usePrecompiledAndBakedKernel ) { - // Move the raw buffer into a std::vector, which avoids potential issues explained here: github.com/GPUOpen-LibrariesAndSDKs/HIPRT/pull/38#issuecomment-2761698032 - std::vector binary(oro_compiled_kernels_h, 
oro_compiled_kernels_h + oro_compiled_kernels_h_size); + std::vector binary; + OrochiUtils::DecompressPrecompiled(binary, oro_compiled_kernels_h, oro_compiled_kernels_h_size, oro_compiled_kernels_h_size_uncompressed); oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary_asData(binary.data(), binary.size(), record.kernelName.c_str() ); } else if constexpr( useBakeKernel ) diff --git a/scripts/convert_binary_to_array.py b/scripts/convert_binary_to_array.py index baab3b8..3c281d7 100644 --- a/scripts/convert_binary_to_array.py +++ b/scripts/convert_binary_to_array.py @@ -1,27 +1,39 @@ -# convert_binary_to_header.py +# convert_binary_to_array.py import sys from pathlib import Path -def binary_to_c_array(bin_file, array_name): +def binary_to_c_array(bin_file, array_name, size_BeforeCompression, compression_activated): with open(bin_file, 'rb') as f: binary_data = f.read() hex_array = ', '.join(f'0x{b:02x}' for b in binary_data) c_array = f'const unsigned char {array_name}[] = {{\n {hex_array}\n}};\n' - c_array += f'const size_t {array_name}_size = sizeof({array_name});\n' + c_array += f'const size_t {array_name}_size = sizeof({array_name}); // {len(binary_data)}\n' + + if not compression_activated: + size_BeforeCompression = 0 # set value to 0 if we are not using compression. 
+ c_array += f'const size_t {array_name}_size_uncompressed = {size_BeforeCompression}; // set to 0 if NOT using the ZSTD compression.\n' return c_array if __name__ == "__main__": - if len(sys.argv) != 3: - print(f"Usage: {sys.argv[0]} ") + if len(sys.argv) != 5: + print(f"Usage: {sys.argv[0]} ") sys.exit(1) - bin_file = sys.argv[1] - header_file_path = sys.argv[2] + bin_file_beforeCompression = sys.argv[1] + bin_file_afterCompression = sys.argv[2] # not used if 'compression_activated' is OFF + header_file_path = sys.argv[3] + compression_activated = sys.argv[4].lower() == "on" # sys.argv[4] should be "ON" or "OFF" + header_file = Path(header_file_path).name array_name = header_file.replace('.', '_') - c_array = binary_to_c_array(bin_file, array_name) + if not compression_activated: + bin_file_afterCompression = bin_file_beforeCompression + + c_array = binary_to_c_array(bin_file_afterCompression, array_name, Path(bin_file_beforeCompression).stat().st_size, compression_activated ) with open(header_file_path, 'w') as f: - f.write("// generated by convert_binary_to_header.py\n") + f.write("// generated by convert_binary_to_array.py\n") + if compression_activated: + f.write(f"// Data is compressed.\n") f.write(c_array) diff --git a/scripts/create_archive.cmake b/scripts/create_archive.cmake new file mode 100644 index 0000000..fa3e432 --- /dev/null +++ b/scripts/create_archive.cmake @@ -0,0 +1,22 @@ + +# create_archive.cmake +# Create a raw Zstd-compressed "archive" from a single file. 
+ +# Variables expected: +# INPUT_FILE – path to the file to compress +# OUTPUT_FILE – path to the compressed file to generate +# DO_COMPRESS: ON/OFF + + +if(DO_COMPRESS) + file(ARCHIVE_CREATE + OUTPUT "${OUTPUT_FILE}" + PATHS "${INPUT_FILE}" + FORMAT raw + COMPRESSION Zstd + COMPRESSION_LEVEL 19 + ) +endif() + + + From 5c0cb3285c9e3b28798f665b500dec24b4d9296d Mon Sep 17 00:00:00 2001 From: Richard Geslot Date: Tue, 22 Apr 2025 11:39:55 +0200 Subject: [PATCH 2/3] precompiled compression - lower required cmake version --- scripts/create_archive.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/create_archive.cmake b/scripts/create_archive.cmake index fa3e432..35d6a52 100644 --- a/scripts/create_archive.cmake +++ b/scripts/create_archive.cmake @@ -9,12 +9,13 @@ if(DO_COMPRESS) + message("Compress ${INPUT_FILE} ...") file(ARCHIVE_CREATE OUTPUT "${OUTPUT_FILE}" PATHS "${INPUT_FILE}" FORMAT raw COMPRESSION Zstd - COMPRESSION_LEVEL 19 + COMPRESSION_LEVEL 9 # 0-9 for cmake >= 3.19 or 0-19 for cmake >= 3.26 ) endif() From cfd066fcf564df2c1bd0f31a09aa3c5c48c93fd9 Mon Sep 17 00:00:00 2001 From: Richard Geslot Date: Fri, 25 Apr 2025 13:44:23 +0200 Subject: [PATCH 3/3] baked kernel compression - improve coding --- Orochi/OrochiUtils.cpp | 38 +++++++++++++++++++----------- Orochi/OrochiUtils.h | 16 +++++++++++-- ParallelPrimitives/RadixSort.cpp | 3 ++- scripts/convert_binary_to_array.py | 14 ++++++++--- 4 files changed, 51 insertions(+), 20 deletions(-) diff --git a/Orochi/OrochiUtils.cpp b/Orochi/OrochiUtils.cpp index 8403f0f..4604c71 100644 --- a/Orochi/OrochiUtils.cpp +++ b/Orochi/OrochiUtils.cpp @@ -794,35 +794,45 @@ void OrochiUtils::launch2D( oroFunction func, int nx, int ny, const void** args, OROASSERT( e == oroSuccess, 0 ); } -void OrochiUtils::DecompressPrecompiled(std::vector& out, const unsigned char* compressedInput, size_t compressedInput_sizeByte, size_t uncompressed_sizeByte) +void OrochiUtils::HandlePrecompiled(std::vector& out, 
const CompressedBuffer& buffer) { - if ( uncompressed_sizeByte > 0 ) // if the input data is actually compressed - { #ifdef ORO_LINK_ZSTD - out.assign(uncompressed_sizeByte,0); + out.assign(buffer.uncompressedSize,0); size_t decompressedSize = ZSTD_decompress( out.data(), // final uncompressed buffer out.size(), // final size - compressedInput, // compressed buffer - compressedInput_sizeByte // compressed buffer - size + buffer.data, // compressed buffer + buffer.size // compressed buffer - size ); - if ( decompressedSize != uncompressed_sizeByte ) + if ( decompressedSize != buffer.uncompressedSize ) throw std::runtime_error( "ERROR: ZSTD_decompress FAILED." ); #else - throw std::runtime_error( "ERROR: ZSTD is not part of this build." ); - #endif + return; +} - } - else // if the input data is NOT compressed, buypass this decompress process. - { - out = std::vector(compressedInput, compressedInput + compressedInput_sizeByte ); - } + +void OrochiUtils::HandlePrecompiled(std::vector& out, const RawBuffer& buffer) +{ + out = std::vector(buffer.data, buffer.data + buffer.size ); return; } +void OrochiUtils::HandlePrecompiled(std::vector& out, const unsigned char* rawData, size_t rawData_sizeByte, std::optional uncompressed_sizeByte) +{ + if (uncompressed_sizeByte.has_value()) { + // if the input buffer is compressed : + CompressedBuffer buffer{ rawData, rawData_sizeByte, uncompressed_sizeByte.value() }; + HandlePrecompiled(out, buffer ); + } else { + // if the input buffer is not compressed + RawBuffer buffer{ rawData, rawData_sizeByte }; + HandlePrecompiled(out, buffer ); + } +} + diff --git a/Orochi/OrochiUtils.h b/Orochi/OrochiUtils.h index 528e1f0..e5f5776 100644 --- a/Orochi/OrochiUtils.h +++ b/Orochi/OrochiUtils.h @@ -27,6 +27,7 @@ #include #include #include +#include #if defined( GNUC ) #include @@ -84,8 +85,19 @@ class OrochiUtils static void launch1D( oroFunction func, int nx, const void** args, int wgSize = 64, unsigned int sharedMemBytes = 0, oroStream 
stream = 0 ); static void launch2D( oroFunction func, int nx, int ny, const void** args, int wgSizeX = 8, int wgSizeY = 8, unsigned int sharedMemBytes = 0, oroStream stream = 0 ); - // if 'uncompressed_sizeByte' is set to 0, it means the input value is not compressed and this function will output the raw buffer. - static void DecompressPrecompiled(std::vector& out, const unsigned char* compressedInput, size_t compressedInput_sizeByte, size_t uncompressed_sizeByte); + + struct CompressedBuffer { + const unsigned char* data = nullptr; // compressed data + size_t size = 0; // size in bytes of 'data' + size_t uncompressedSize = 0; // size in bytes of the uncompressed data. + }; + struct RawBuffer { + const unsigned char* data = nullptr; + size_t size = 0; + }; + static void HandlePrecompiled(std::vector& out, const CompressedBuffer& buffer); + static void HandlePrecompiled(std::vector& out, const RawBuffer& buffer); + static void HandlePrecompiled(std::vector& out, const unsigned char* rawData, size_t rawData_sizeByte, std::optional uncompressed_sizeByte=std::nullopt); template static void malloc( T*& ptr, size_t n ) diff --git a/ParallelPrimitives/RadixSort.cpp b/ParallelPrimitives/RadixSort.cpp index 723b0fd..f9f7011 100644 --- a/ParallelPrimitives/RadixSort.cpp +++ b/ParallelPrimitives/RadixSort.cpp @@ -55,6 +55,7 @@ static const char** RadixSortKernelsIncludes = nullptr; const unsigned char oro_compiled_kernels_h[] = ""; const size_t oro_compiled_kernels_h_size = 0; const size_t oro_compiled_kernels_h_size_uncompressed = 0; +const bool oro_compiled_kernels_h_isCompressed = false; #endif constexpr uint64_t div_round_up64( uint64_t val, uint64_t divisor ) noexcept { return ( val + divisor - 1 ) / divisor; } @@ -191,7 +192,7 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string if constexpr( usePrecompiledAndBakedKernel ) { std::vector binary; - OrochiUtils::DecompressPrecompiled(binary, oro_compiled_kernels_h, oro_compiled_kernels_h_size, 
oro_compiled_kernels_h_size_uncompressed); + OrochiUtils::HandlePrecompiled(binary, oro_compiled_kernels_h, oro_compiled_kernels_h_size, oro_compiled_kernels_h_isCompressed ? std::optional{oro_compiled_kernels_h_size_uncompressed} : std::nullopt); oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary_asData(binary.data(), binary.size(), record.kernelName.c_str() ); } else if constexpr( useBakeKernel ) diff --git a/scripts/convert_binary_to_array.py b/scripts/convert_binary_to_array.py index 3c281d7..190fcba 100644 --- a/scripts/convert_binary_to_array.py +++ b/scripts/convert_binary_to_array.py @@ -10,9 +10,17 @@ def binary_to_c_array(bin_file, array_name, size_BeforeCompression, compression_ c_array = f'const unsigned char {array_name}[] = {{\n {hex_array}\n}};\n' c_array += f'const size_t {array_name}_size = sizeof({array_name}); // {len(binary_data)}\n' - if not compression_activated: - size_BeforeCompression = 0 # set value to 0 if we are not using compression. - c_array += f'const size_t {array_name}_size_uncompressed = {size_BeforeCompression}; // set to 0 if NOT using the ZSTD compression.\n' + c_array += f'const size_t {array_name}_size_uncompressed = ' + if compression_activated: + c_array += f'{size_BeforeCompression}; // size of the data in bytes, once it has been uncompressed.\n' + else: + c_array += f'{array_name}_size; // same than raw buffer, because data is not compressed.\n' + + c_array += f'const bool {array_name}_isCompressed = ' + if compression_activated: + c_array += f'true;\n' + else: + c_array += f'false;\n' return c_array if __name__ == "__main__":