Skip to content

Commit ae9c7ce

Browse files
committed
Merge branch 'main' into release/hip6.0_cuda12.2
2 parents 320e003 + ef26098 commit ae9c7ce

10 files changed

Lines changed: 141 additions & 35 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,7 @@ build/
2525
result.xml
2626
UnitTest/bitcodes/*.fatbin
2727
Test/SimpleD3D12/cache/**
28+
29+
ParallelPrimitives/cache/KernelArgs.h
30+
ParallelPrimitives/cache/Kernels.h
31+
ParallelPrimitives/cache/oro_compiled_kernels.h

Orochi/GpuMemory.h

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
namespace Oro
2929
{
3030

31-
/// @brief A helper function that casts an address of a pointer to the device memory to a void pointer to be used as an argument for kernel calls.
31+
/// @brief A helper function that casts an address of a pointer to the device memory to a void pointer to be used as an argument for kernel calls.
3232
/// @tparam T The type of the element stored in the device memory.
3333
/// @param ptr The address of a pointer to the device memory.
3434
/// @return A void pointer.
@@ -44,8 +44,8 @@ class GpuMemory final
4444
public:
4545
GpuMemory() = default;
4646

47-
/// @brief Allocate the device memory with the given size.
48-
/// @param init_size The initial size which represents the number of elements.
47+
/// @brief Allocate the elements on the device memory.
48+
/// @param init_size The initial container size which represents the number of elements.
4949
explicit GpuMemory( const size_t init_size )
5050
{
5151
OrochiUtils::malloc( m_data, init_size );
@@ -61,9 +61,9 @@ class GpuMemory final
6161

6262
GpuMemory& operator=( GpuMemory&& other ) noexcept
6363
{
64-
GpuMemory tmp( std::move( *this ) );
64+
GpuMemory tmp( std::move( other ) );
6565

66-
swap( *this, other );
66+
swap( *this, tmp );
6767

6868
return *this;
6969
}
@@ -79,8 +79,8 @@ class GpuMemory final
7979
m_capacity = 0ULL;
8080
}
8181

82-
/// @brief Get the size of the device memory.
83-
/// @return The size of the device memory.
82+
/// @brief Get the container size which represents the number of elements.
83+
/// @return The container size which represents the number of elements.
8484
size_t size() const noexcept { return m_size; }
8585

8686
/// @brief Get the pointer to the device memory.
@@ -91,9 +91,9 @@ class GpuMemory final
9191
/// @return The address of the pointer to the device memory.
9292
T* const* address() const noexcept { return &m_data; }
9393

94-
/// @brief Resize the device memory. Its capacity is unchanged if the new size is smaller than the current one.
94+
/// @brief Resize the container. Its capacity is unchanged if the new size is smaller than the current one.
9595
/// The old data should be considered invalid to be used after the function is called unless @c copy is set to True.
96-
/// @param new_size The new memory size after the function is called.
96+
/// @param new_size The new container size which represents the number of elements after the function is called.
9797
/// @param copy If true, the function will copy the data to the newly created memory space as well.
9898
void resize( const size_t new_size, const bool copy = false ) noexcept
9999
{
@@ -113,8 +113,8 @@ class GpuMemory final
113113
*this = std::move( tmp );
114114
}
115115

116-
/// @brief Asynchronous version of 'resize' using a given Orochi stream.
117-
/// @param new_size The new memory size after the function is called.
116+
/// @brief Asynchronous version of @c resize using a given Orochi stream.
117+
/// @param new_size The new container size which represents the number of elements after the function is called.
118118
/// @param copy If true, the function will copy the data to the newly created memory space as well.
119119
/// @param stream The Orochi stream used for the underlying operations.
120120
void resizeAsync( const size_t new_size, const bool copy = false, oroStream stream = 0 ) noexcept
@@ -138,7 +138,7 @@ class GpuMemory final
138138
/// @brief Reset the memory space so that all bits inside are cleared to zero.
139139
void reset() noexcept { OrochiUtils::memset( m_data, 0, m_size * sizeof( T ) ); }
140140

141-
/// @brief Asynchronous version of 'reset' using a given Orochi stream.
141+
/// @brief Asynchronous version of @c reset using a given Orochi stream.
142142
/// @param stream The Orochi stream used for the underlying operations.
143143
void resetAsync( oroStream stream = 0 ) noexcept { OrochiUtils::memsetAsync( m_data, 0, m_size * sizeof( T ), stream ); }
144144

Orochi/OrochiUtils.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,41 @@ oroFunction OrochiUtils::getFunctionFromString( oroDevice device, const char* so
558558
return f;
559559
}
560560

561+
/// @brief Load a kernel function from a precompiled binary that is already resident in host memory.
/// The result is cached per function name, so repeated calls with the same @c funcName return the cached function.
/// @param precompData Pointer to the precompiled binary image (e.g. an array baked into the executable).
/// @param dataSizeInBytes Size of the image in bytes. Currently unused: the module loader called here takes no size argument.
/// @param funcName Name of the kernel function to look up inside the loaded module.
/// @return The function handle, or nullptr if the data pointer is null, the module fails to load, or the function is not found.
oroFunction OrochiUtils::getFunctionFromPrecompiledBinary_asData( const unsigned char* precompData, size_t dataSizeInBytes, const std::string& funcName )
{
	std::lock_guard<std::recursive_mutex> lock( m_mutex );

	// oroModuleLoadData receives only the image pointer; the size stays in the interface for callers.
	(void)dataSizeInBytes;

	const std::string cacheName = OrochiUtilsImpl::getCacheName( "___BAKED_BIN___", funcName );

	// Single lookup: reuse the iterator instead of searching the map a second time.
	const auto cached = m_kernelMap.find( cacheName );
	if( cached != m_kernelMap.end() )
	{
		return cached->second.function;
	}

	if( precompData == nullptr )
	{
		printf( "getFunctionFromPrecompiledBinary_asData FAILED (null data) loading baked precomp data: %s\n", funcName.c_str() );
		return nullptr;
	}

	oroModule module = nullptr;
	oroError e = oroModuleLoadData( &module, precompData );
	if( e != oroSuccess )
	{
		// add some verbose info to help debugging missing data
		printf( "oroModuleLoadData FAILED (error = %d) loading baked precomp data: %s\n", static_cast<int>( e ), funcName.c_str() );
		return nullptr;
	}

	oroFunction functionOut{};
	e = oroModuleGetFunction( &functionOut, module, funcName.c_str() );
	if( e != oroSuccess )
	{
		// add some verbose info to help debugging missing data
		printf( "oroModuleGetFunction FAILED (error = %d) loading baked precomp data: %s\n", static_cast<int>( e ), funcName.c_str() );

		// The module was loaded but is not going to be cached: release it so it does not leak.
		oroModuleUnload( module );
		return nullptr;
	}

	// Store both handles under the same cache entry.
	auto& entry = m_kernelMap[cacheName];
	entry.function = functionOut;
	entry.module = module;

	return functionOut;
}
595+
561596
oroFunction OrochiUtils::getFunctionFromPrecompiledBinary( const std::string& path, const std::string& funcName )
562597
{
563598
std::lock_guard<std::recursive_mutex> lock( m_mutex );

Orochi/OrochiUtils.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ class OrochiUtils
6969

7070
oroFunction getFunctionFromPrecompiledBinary( const std::string& path, const std::string& funcName );
7171

72+
// this function is like 'getFunctionFromPrecompiledBinary' but instead of giving a path to a file, we give the data directly.
73+
// ( use the script convert_binary_to_array.py to convert the .hipfb to a C-array. )
74+
oroFunction getFunctionFromPrecompiledBinary_asData( const unsigned char* data, size_t dataSizeInBytes, const std::string& funcName );
75+
7276
oroFunction getFunctionFromFile( oroDevice device, const char* path, const char* funcName, std::vector<const char*>* opts );
7377
oroFunction getFunctionFromString( oroDevice device, const char* source, const char* path, const char* funcName, std::vector<const char*>* opts, int numHeaders, const char** headers, const char** includeNames );
7478
oroFunction getFunction( oroDevice device, const char* code, const char* path, const char* funcName, std::vector<const char*>* opts, int numHeaders = 0, const char** headers = 0, const char** includeNames = 0, oroModule* loadedModule = 0 );

ParallelPrimitives/RadixSort.cpp

Lines changed: 50 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,37 +27,63 @@
2727
#include <iostream>
2828
#include <numeric>
2929

30-
#if defined( ORO_PP_LOAD_FROM_STRING )
31-
30+
// if ORO_PP_LOAD_FROM_STRING && ORO_PRECOMPILED -> we load the precompiled/baked kernels.
31+
// if ORO_PP_LOAD_FROM_STRING && NOT ORO_PRECOMPILED -> we load the baked source code kernels (from Kernels.h / KernelArgs.h)
32+
#if !defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
3233
// Note: the include order must be in this particular form.
3334
// clang-format off
3435
#include <ParallelPrimitives/cache/Kernels.h>
3536
#include <ParallelPrimitives/cache/KernelArgs.h>
3637
// clang-format on
38+
#else
39+
// if Kernels.h / KernelArgs.h are not included, declare nullptr strings
40+
static const char* hip_RadixSortKernels = nullptr;
41+
namespace hip
42+
{
43+
static const char** RadixSortKernelsArgs = nullptr;
44+
static const char** RadixSortKernelsIncludes = nullptr;
45+
}
3746
#endif
3847

3948
#if defined( __GNUC__ )
4049
#include <dlfcn.h>
4150
#endif
4251

43-
namespace
44-
{
45-
#if defined( ORO_PRECOMPILED )
46-
constexpr auto useBitCode = true;
52+
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
53+
#include <ParallelPrimitives/cache/oro_compiled_kernels.h> // generate this header with 'convert_binary_to_array.py'
4754
#else
48-
constexpr auto useBitCode = false;
55+
const unsigned char oro_compiled_kernels_h[] = "";
56+
const size_t oro_compiled_kernels_h_size = 0;
4957
#endif
5058

51-
#if defined( ORO_PP_LOAD_FROM_STRING )
52-
constexpr auto useBakeKernel = true;
53-
#else
54-
constexpr auto useBakeKernel = false;
55-
static const char* hip_RadixSortKernels = nullptr;
56-
namespace hip
59+
namespace
5760
{
58-
static const char** RadixSortKernelsArgs = nullptr;
59-
static const char** RadixSortKernelsIncludes = nullptr;
60-
} // namespace hip
61+
62+
// If both of these preprocessor macros are defined, the 'usePrecompiledAndBakedKernel' mode is activated.
63+
#if defined( ORO_PRECOMPILED ) && defined( ORO_PP_LOAD_FROM_STRING )
64+
65+
// this flag means that we bake the precompiled kernels
66+
constexpr auto usePrecompiledAndBakedKernel = true;
67+
68+
constexpr auto useBitCode = false;
69+
constexpr auto useBakeKernel = false;
70+
71+
#else
72+
73+
constexpr auto usePrecompiledAndBakedKernel = false;
74+
75+
#if defined( ORO_PRECOMPILED )
76+
constexpr auto useBitCode = true; // this flag means we use the bitcode file
77+
#else
78+
constexpr auto useBitCode = false;
79+
#endif
80+
81+
#if defined( ORO_PP_LOAD_FROM_STRING )
82+
constexpr auto useBakeKernel = true; // this flag means we use the HIP source code embedded in the binary ( as a string )
83+
#else
84+
constexpr auto useBakeKernel = false;
85+
#endif
86+
6187
#endif
6288

6389
static_assert( !( useBitCode && useBakeKernel ), "useBitCode and useBakeKernel cannot coexist" );
@@ -211,9 +237,14 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string
211237
opts.push_back( sort_block_size_param.c_str() );
212238
opts.push_back( sort_num_warps_param.c_str() );
213239

240+
214241
for( const auto& record : records )
215242
{
216-
if constexpr( useBakeKernel )
243+
if constexpr( usePrecompiledAndBakedKernel )
244+
{
245+
oroFunctions[record.kernelType] = m_oroutils.getFunctionFromPrecompiledBinary_asData(oro_compiled_kernels_h, oro_compiled_kernels_h_size, record.kernelName.c_str() );
246+
}
247+
else if constexpr( useBakeKernel )
217248
{
218249
oroFunctions[record.kernelType] = m_oroutils.getFunctionFromString( m_device, hip_RadixSortKernels, currentKernelPath.c_str(), record.kernelName.c_str(), &opts, 1, hip::RadixSortKernelsArgs, hip::RadixSortKernelsIncludes );
219250
}
@@ -231,6 +262,8 @@ void RadixSort::compileKernels( const std::string& kernelPath, const std::string
231262
printKernelInfo( record.kernelName, oroFunctions[record.kernelType] );
232263
}
233264
}
265+
266+
return;
234267
}
235268

236269
int RadixSort::calculateWGsToExecute( const int blockSize ) const noexcept

Test/DeviceEnum/main.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ int main( int argc, char** argv )
6666
e = oroCtxCreate( &ctx, 0, device );
6767
ERROR_CHECK( e );
6868

69+
e = oroCtxSetCurrent( ctx );
70+
ERROR_CHECK( e );
71+
6972
//try kernel execution
7073
oroFunction function;
7174
{
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:8e6b58a3484af4b2104fbe8bc1030e9c04f897b82bb1e41976aee511b379cf9c
3-
size 110272952
2+
oid sha256:d99d86fe78b719ca2f0502da3f37f5d41ef3f9efa0bc5ebf39c129cc31a9653d
3+
size 110273240
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:f2797ba9fa817f9a8f2cb3caa0f6dcdffdcb3b4c3dd353258536f7ac1522e727
3-
size 1065400
2+
oid sha256:cc29cf387cf1e1b59826f4ffe21922f849f47378cea3ec2ce00dc2ad38254d95
3+
size 1065688

contrib/bin/win64/hiprtc0601.dll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:893446666ddd9c076b24affc34d48b06332b212ec04ff42d568d25da2b168610
3-
size 1939896
2+
oid sha256:4ac7b741cf60afc4de24a07f15c3bd2fce5e052e0be8497960b75004a6d2de11
3+
size 1940184

scripts/convert_binary_to_array.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# convert_binary_to_array.py
2+
import sys
3+
from pathlib import Path
4+
5+
def binary_to_c_array(bin_file, array_name, bytes_per_line=16):
    """Render the contents of a binary file as C source defining a byte array.

    The returned text defines ``const unsigned char <array_name>[]`` holding
    every byte of the file, followed by ``const size_t <array_name>_size``
    set to ``sizeof(<array_name>)``.

    Args:
        bin_file: Path of the binary file to read.
        array_name: C identifier used for the array (and ``_size`` suffix).
        bytes_per_line: Number of bytes emitted per source line. Wrapping keeps
            each generated line short instead of one line for the whole file.

    Returns:
        The C source text as a string.

    Raises:
        ValueError: If the input file is empty (an empty initializer would not
            be a valid array definition) or ``bytes_per_line`` is less than 1.
    """
    if bytes_per_line < 1:
        raise ValueError("bytes_per_line must be at least 1")

    with open(bin_file, 'rb') as f:
        binary_data = f.read()

    if not binary_data:
        raise ValueError(f"input binary file is empty: {bin_file}")

    # One source line per chunk of `bytes_per_line` bytes.
    rows = [
        '    ' + ', '.join(f'0x{b:02x}' for b in binary_data[i:i + bytes_per_line])
        for i in range(0, len(binary_data), bytes_per_line)
    ]
    hex_array = ',\n'.join(rows)

    c_array = f'const unsigned char {array_name}[] = {{\n{hex_array}\n}};\n'
    c_array += f'const size_t {array_name}_size = sizeof({array_name});\n'
    return c_array
13+
14+
if __name__ == "__main__":
    # Command line: <input_binary_file> <output_header_file>
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <input_binary_file> <output_header_file>")
        sys.exit(1)

    bin_file = sys.argv[1]
    header_file_path = sys.argv[2]

    # The array is named after the header file, e.g. "kernels.h" -> "kernels_h".
    # Every character that cannot appear in a C identifier (not only '.') is
    # mapped to '_' so names such as "my-kernels.h" still give a valid identifier.
    header_file = Path(header_file_path).name
    array_name = ''.join(c if (c.isalnum() or c == '_') else '_' for c in header_file)

    c_array = binary_to_c_array(bin_file, array_name)
    with open(header_file_path, 'w') as f:
        # Name the script that actually produced the header.
        f.write("// generated by convert_binary_to_array.py\n")
        f.write(c_array)

0 commit comments

Comments
 (0)