From 90b5414a1653b2645478ea7c75c499b4ac25e3e4 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Fri, 17 Apr 2026 11:48:53 -0600 Subject: [PATCH 1/2] [SM6.10][Exec][Bugfix] Thread mats should be OuterProductOptimal layout --- tools/clang/unittests/HLSLExec/LinAlgTests.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index d9d22863f1..fb5412e143 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -73,11 +73,13 @@ struct MatrixParams { bool Enable16Bit; bool EmulateTest; - size_t strideBytes() const { + size_t rowStride() const { uint32_t ES = elementSize(CompType); if (Layout == LinalgMatrixLayout::RowMajor) return N * ES; - return M * ES; + if (Layout == LinalgMatrixLayout::ColumnMajor) + return M * ES; + return 0; } size_t totalElements() const { return M * N; } @@ -94,7 +96,7 @@ static std::string buildCompilerArgs(const MatrixParams &Params, SS << " -DN_DIM=" << Params.N; SS << " -DUSE=" << static_cast(Params.Use); SS << " -DSCOPE=" << static_cast(Params.Scope); - SS << " -DSTRIDE=" << Params.strideBytes(); + SS << " -DSTRIDE=" << Params.rowStride(); SS << " -DLAYOUT=" << static_cast(Params.Layout); SS << " -DELEM_SIZE=" << static_cast(elementSize(Params.CompType)); SS << " -DNUMTHREADS=" << Params.NumThreads; @@ -620,7 +622,7 @@ void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() { Params.N = 16; Params.Use = MatrixUse::Accumulator; Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::RowMajor; + Params.Layout = LinalgMatrixLayout::OuterProductOptimal; Params.NumThreads = 1; Params.Enable16Bit = true; runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 19, VerboseLogging); @@ -1220,7 +1222,7 @@ void DxilConf_SM610_LinAlg::MatVecMul_Thread_16x16_F16() { Params.M = 16; Params.N = 16; Params.Scope = MatrixScope::Thread; - 
Params.Layout = LinalgMatrixLayout::RowMajor; + Params.Layout = LinalgMatrixLayout::OuterProductOptimal; Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMul(D3DDevice, DxcSupport, Params, VerboseLogging, @@ -1315,7 +1317,7 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { Params.M = 16; Params.N = 16; Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::RowMajor; + Params.Layout = LinalgMatrixLayout::OuterProductOptimal; Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMulAdd(D3DDevice, DxcSupport, Params, VerboseLogging, @@ -1399,7 +1401,7 @@ void DxilConf_SM610_LinAlg::OuterProduct_Thread_16x16_F16() { Params.M = 16; Params.N = 16; Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::RowMajor; + Params.Layout = LinalgMatrixLayout::OuterProductOptimal; Params.NumThreads = 1; Params.Enable16Bit = true; runOuterProduct(D3DDevice, DxcSupport, Params, VerboseLogging); From 2c71946192983c5a5e55f2d69c8e5b88ebe7c8d4 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Fri, 17 Apr 2026 18:01:31 -0600 Subject: [PATCH 2/2] More test fixes --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 34 ++++++++----------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index fb5412e143..31ddd4e008 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -322,7 +322,6 @@ class DxilConf_SM610_LinAlg { TEST_METHOD(LoadStoreDescriptor_Wave_16x16_F16); TEST_METHOD(SplatStore_Wave_16x16_F16); TEST_METHOD(AccumulateDescriptor_Wave_16x16_F16); - TEST_METHOD(AccumulateDescriptor_Thread_16x16_F16); // Load/Store/Accumulate Memory TEST_METHOD(LoadMemory_Wave_16x16_F16); @@ -539,6 +538,9 @@ void DxilConf_SM610_LinAlg::SplatStore_Wave_16x16_F16() { runSplatStore(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); } +// Since 
MatrixAccumulateToDescriptor requires an accumulator matrix and +// MatrixLoadFromDescriptor always returns an A matrix when loading a Thread +// matrix this shader only makes sense for Wave/ThreadGroup static const char AccumulateDescriptorShader[] = R"( #define USE_ACC 2 @@ -615,19 +617,6 @@ void DxilConf_SM610_LinAlg::AccumulateDescriptor_Wave_16x16_F16() { runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 12, VerboseLogging); } -void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() { - MatrixParams Params = {}; - Params.CompType = ComponentType::F16; - Params.M = 16; - Params.N = 16; - Params.Use = MatrixUse::Accumulator; - Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::OuterProductOptimal; - Params.NumThreads = 1; - Params.Enable16Bit = true; - runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 19, VerboseLogging); -} - static const char ElementAccessShader[] = R"( RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); @@ -1222,7 +1211,7 @@ void DxilConf_SM610_LinAlg::MatVecMul_Thread_16x16_F16() { Params.M = 16; Params.N = 16; Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::OuterProductOptimal; + Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMul(D3DDevice, DxcSupport, Params, VerboseLogging, @@ -1317,7 +1306,7 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { Params.M = 16; Params.N = 16; Params.Scope = MatrixScope::Thread; - Params.Layout = LinalgMatrixLayout::OuterProductOptimal; + Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMulAdd(D3DDevice, DxcSupport, Params, VerboseLogging, @@ -1326,8 +1315,14 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { } static const char OuterProductShader[] = R"( - #define USE_A 0 + // OuterProduct Matrix must be Thread scope #define SCOPE_THREAD 0 + // OuterProduct/Accumulate 
must be Accumulator use + #define USE_ACC 2 + // Accumulate Layout must be OuterProductOptimal + #define LAYOUT_OUTER_PROD_OPT 4 + // Accumulate Stride must be 0 for non Row/Col Major + #define STRIDE 0 RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); @@ -1347,12 +1342,12 @@ static const char OuterProductShader[] = R"( } __builtin_LinAlgMatrix - [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE_THREAD)]] Mat; __builtin_LinAlg_MatrixOuterProduct(Mat, VecA, VecB); __builtin_LinAlg_MatrixAccumulateToDescriptor( - Mat, Output, 0, STRIDE, LAYOUT, 128); + Mat, Output, 0, STRIDE, LAYOUT_OUTER_PROD_OPT, 128); } )"; @@ -1400,6 +1395,7 @@ void DxilConf_SM610_LinAlg::OuterProduct_Thread_16x16_F16() { Params.CompType = ComponentType::F16; Params.M = 16; Params.N = 16; + Params.Use = MatrixUse::Accumulator; Params.Scope = MatrixScope::Thread; Params.Layout = LinalgMatrixLayout::OuterProductOptimal; Params.NumThreads = 1;