diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index d9d22863f1..31ddd4e008 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -73,11 +73,13 @@ struct MatrixParams { bool Enable16Bit; bool EmulateTest; - size_t strideBytes() const { + size_t rowStride() const { uint32_t ES = elementSize(CompType); if (Layout == LinalgMatrixLayout::RowMajor) return N * ES; - return M * ES; + if (Layout == LinalgMatrixLayout::ColumnMajor) + return M * ES; + return 0; } size_t totalElements() const { return M * N; } @@ -94,7 +96,7 @@ static std::string buildCompilerArgs(const MatrixParams &Params, SS << " -DN_DIM=" << Params.N; SS << " -DUSE=" << static_cast(Params.Use); SS << " -DSCOPE=" << static_cast(Params.Scope); - SS << " -DSTRIDE=" << Params.strideBytes(); + SS << " -DSTRIDE=" << Params.rowStride(); SS << " -DLAYOUT=" << static_cast(Params.Layout); SS << " -DELEM_SIZE=" << static_cast(elementSize(Params.CompType)); SS << " -DNUMTHREADS=" << Params.NumThreads; @@ -320,7 +322,6 @@ class DxilConf_SM610_LinAlg { TEST_METHOD(LoadStoreDescriptor_Wave_16x16_F16); TEST_METHOD(SplatStore_Wave_16x16_F16); TEST_METHOD(AccumulateDescriptor_Wave_16x16_F16); - TEST_METHOD(AccumulateDescriptor_Thread_16x16_F16); // Load/Store/Accumulate Memory TEST_METHOD(LoadMemory_Wave_16x16_F16); @@ -537,6 +538,9 @@ void DxilConf_SM610_LinAlg::SplatStore_Wave_16x16_F16() { runSplatStore(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); } +// Since MatrixAccumulateToDescriptor requires an accumulator matrix and +// MatrixLoadFromDescriptor always returns an A matrix when loading a Thread +// matrix this shader only makes sense for Wave/ThreadGroup static const char AccumulateDescriptorShader[] = R"( #define USE_ACC 2 @@ -613,19 +617,6 @@ void DxilConf_SM610_LinAlg::AccumulateDescriptor_Wave_16x16_F16() { runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 12, 
VerboseLogging); } -void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() { - MatrixParams Params = {}; - Params.CompType = ComponentType::F16; - Params.M = 16; - Params.N = 16; - Params.Use = MatrixUse::Accumulator; - Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::RowMajor; - Params.NumThreads = 1; - Params.Enable16Bit = true; - runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 19, VerboseLogging); -} - static const char ElementAccessShader[] = R"( RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); @@ -1324,8 +1315,14 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { } static const char OuterProductShader[] = R"( - #define USE_A 0 + // OuterProduct Matrix must be Thread scope #define SCOPE_THREAD 0 + // OuterProduct/Accumulate must be Accumulator use + #define USE_ACC 2 + // Accumulate Layout must be OuterProductOptimal + #define LAYOUT_OUTER_PROD_OPT 4 + // Accumulate Stride must be 0 for non Row/Col Major + #define STRIDE 0 RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); @@ -1345,12 +1342,12 @@ static const char OuterProductShader[] = R"( } __builtin_LinAlgMatrix - [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE_THREAD)]] Mat; __builtin_LinAlg_MatrixOuterProduct(Mat, VecA, VecB); __builtin_LinAlg_MatrixAccumulateToDescriptor( - Mat, Output, 0, STRIDE, LAYOUT, 128); + Mat, Output, 0, STRIDE, LAYOUT_OUTER_PROD_OPT, 128); } )"; @@ -1398,8 +1395,9 @@ void DxilConf_SM610_LinAlg::OuterProduct_Thread_16x16_F16() { Params.CompType = ComponentType::F16; Params.M = 16; Params.N = 16; + Params.Use = MatrixUse::Accumulator; Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::RowMajor; + Params.Layout = LinalgMatrixLayout::OuterProductOptimal; Params.NumThreads = 1; Params.Enable16Bit = true; runOuterProduct(D3DDevice, 
DxcSupport, Params, VerboseLogging);