From bb28c903773aee5f3cb02f0f4c0e22f2dc37a263 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Fri, 10 Apr 2026 17:15:58 -0600 Subject: [PATCH 01/21] [SM6.10][Exec] Implement Remaining Smoke Tests --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 271 +++++++++++++++++- 1 file changed, 260 insertions(+), 11 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index da32f553c4..0de273873a 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -307,6 +307,14 @@ class DxilConf_SM610_LinAlg { // Element access TEST_METHOD(ElementAccess_Wave_16x16_F16); + TEST_METHOD(ElementSet_Wave_16x16_F16); + + // Cast/Convert + TEST_METHOD(CopyConvert_Wave_16x16_F16); + TEST_METHOD(CopyConvert_Wave_16x16_F16_Transpose); + + // Matrix Arithmetic + TEST_METHOD(MatMatMul_Wave_16x16x16_F16); private: CComPtr D3DDevice; @@ -537,14 +545,9 @@ static void runElementAccess(ID3D12Device *Device, const MatrixParams &Params, bool Verbose) { const size_t NumElements = Params.totalElements(); const size_t NumThreads = Params.NumThreads; - const size_t InputBufSize = Params.totalBytes(); - const size_t ElementSize = elementSize(Params.CompType); - - // Output: ElementSize bytes per element - // 1 element for each mat idx - // 1 uint for each thread's length - const size_t OutputBufSize = - NumElements * ElementSize + NumThreads * sizeof(uint32_t); + const size_t MatrixSize = Params.totalBytes(); + // OutputBuf needs to fit the Matrix plus one uint per thread + const size_t OutputBufSize = MatrixSize + NumThreads * sizeof(uint32_t); std::stringstream ExtraDefs; std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); @@ -555,7 +558,7 @@ static void runElementAccess(ID3D12Device *Device, auto Op = createComputeOp(ElementAccessShader, "cs_6_10", "UAV(u0), UAV(u1)", Args.c_str()); - addUAVBuffer(Op.get(), "Input", InputBufSize, false, "byname"); + addUAVBuffer(Op.get(), "Input", MatrixSize, false, "byname"); addUAVBuffer(Op.get(), "Output", OutputBufSize, true); addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); @@ -579,9 +582,8 @@ static void runElementAccess(ID3D12Device *Device, // Verify the end of the buffer is NumThreads number of lengths, whose // sum is greater than or equal to NumElements const BYTE *Out = static_cast(OutData.data()); - size_t MatrixEndOffset = NumElements * ElementSize; const uint32_t *Lengths = - reinterpret_cast(Out + MatrixEndOffset); + reinterpret_cast(Out + MatrixSize); uint32_t TotalLength = 0; for (size_t I = 0; I < NumThreads; ++I) TotalLength += Lengths[I]; @@ -602,4 +604,251 @@ void DxilConf_SM610_LinAlg::ElementAccess_Wave_16x16_F16() { runElementAccess(D3DDevice, DxcSupport, Params, VerboseLogging); } +static const char ElementSetShader[] = R"( + RWByteAddressBuffer Input : register(u0); + RWByteAddressBuffer Output : register(u1); + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE, SCOPE)]] + Mat; + __builtin_LinAlg_MatrixLoadFromDescriptor( + Mat, Input, 0, STRIDE, LAYOUT, 128); + + // Increment every element by 5 + for (uint I = 0; I < __builtin_LinAlg_MatrixLength(Mat); ++I) { + ELEM_TYPE Elem; + __builtin_LinAlg_MatrixGetElement(Elem, Mat, I); + Elem = Elem + 5; + __builtin_LinAlg_MatrixSetElement(Mat, Mat, I, Elem); + } + + __builtin_LinAlg_MatrixStoreToDescriptor( + Mat, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runElementSet(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose) { + const size_t NumElements = Params.totalElements(); + const size_t MatrixSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, ElementSetShader, "cs_6_10", Args, Verbose); + + // Start counting from 6 since each element was increased by 5 + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 6); + + auto Op = createComputeOp(ElementSetShader, "cs_6_10", "UAV(u0), UAV(u1)", + Args.c_str()); + addUAVBuffer(Op.get(), "Input", MatrixSize, false, "byname"); + addUAVBuffer(Op.get(), "Output", MatrixSize, true); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); + + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements), + "Saw unsupported component type"); + }); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + // Verify the front of the buffer is a list of elements of the expected type + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); + +} + +void DxilConf_SM610_LinAlg::ElementSet_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::Accumulator; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runElementSet(D3DDevice, DxcSupport, Params, VerboseLogging); +} + +static const char CopyConvertShader[] = R"( + RWByteAddressBuffer Input : register(u0); + RWByteAddressBuffer Output : register(u1); + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE, SCOPE)]] + Src; + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, N_DIM, M_DIM, USE, SCOPE)]] + Dst; + + __builtin_LinAlg_MatrixLoadFromDescriptor( + Src, Input, 0, STRIDE, LAYOUT, 128); + __builtin_LinAlg_CopyConvertMatrix(Dst, Src, TRANSPOSE); + __builtin_LinAlg_MatrixStoreToDescriptor( + Dst, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runCopyConvert(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, bool Transpose) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DTRANSPOSE=" << Transpose; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, CopyConvertShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1, /*Increment=*/true, Transpose); + + // Construct the ShaderOp: two UAV buffers, load from one, store to other. + auto Op = createComputeOp(CopyConvertShader, "cs_6_10", "UAV(u0), UAV(u1)", + Args.c_str()); + addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); + + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements), + "Saw unsupported component type"); + }); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::CopyConvert_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::A; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runCopyConvert(D3DDevice, DxcSupport, Params, VerboseLogging, /*Transpose=*/false); +} + +void DxilConf_SM610_LinAlg::CopyConvert_Wave_16x16_F16_Transpose() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::A; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runCopyConvert(D3DDevice, DxcSupport, Params, VerboseLogging, /*Transpose=*/true); +} + +static const char MatMatMulShader[] = R"( + #define USE_A 0 + #define USE_B 1 + #define USE_ACC 2 + + RWByteAddressBuffer Output : register(u0); + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, K_DIM, USE_A, SCOPE)]] + MatA; + __builtin_LinAlg_FillMatrix(MatA, A_FILL); + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, K_DIM, N_DIM, USE_B, SCOPE)]] + MatB; + __builtin_LinAlg_FillMatrix(MatB, B_FILL); + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE)]] + MatC; + __builtin_LinAlg_MatrixMatrixMultiply(MatC, MatA, MatB); + + __builtin_LinAlg_MatrixStoreToDescriptor( + MatC, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runMatMatMul(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, MatrixDim K, float AFill, float BFill) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DK_DIM=" << K; + ExtraDefs << " -DA_FILL=" << AFill; + ExtraDefs << " -DB_FILL=" << BFill; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, MatMatMulShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, AFill * BFill * K, /*Increment=*/false); + + auto Op = + createComputeOp(MatMatMulShader, "cs_6_10", "UAV(u0)", Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::MatMatMul_Wave_16x16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, /*AFill=*/2.0f, /*BFill=*/3.0f); +} + } // namespace LinAlg From 6b0b6a507bd48288644aac960d6fe7223eb1cf15 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 10 Apr 2026 23:27:46 +0000 Subject: [PATCH 02/21] chore: autopublish 2026-04-10T23:27:46Z --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 0de273873a..1d420d1ca6 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -634,8 +634,8 @@ static const char ElementSetShader[] = R"( )"; static void runElementSet(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose) { const size_t NumElements = Params.totalElements(); const size_t MatrixSize = Params.totalBytes(); @@ -669,7 +669,6 @@ static void runElementSet(ID3D12Device *Device, // Verify the front of the buffer is a list of elements of the expected type VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), Expected, NumElements, Verbose)); - } void DxilConf_SM610_LinAlg::ElementSet_Wave_16x16_F16() { @@ -711,8 +710,9 @@ static const char CopyConvertShader[] = R"( )"; static void runCopyConvert(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, bool Transpose) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + bool Transpose) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -723,7 +723,8 @@ static void runCopyConvert(ID3D12Device *Device, compileShader(DxcSupport, CopyConvertShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1, /*Increment=*/true, Transpose); + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1, + /*Increment=*/true, Transpose); // Construct the ShaderOp: two UAV buffers, load from one, store to other. auto Op = createComputeOp(CopyConvertShader, "cs_6_10", "UAV(u0), UAV(u1)", @@ -759,7 +760,8 @@ void DxilConf_SM610_LinAlg::CopyConvert_Wave_16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runCopyConvert(D3DDevice, DxcSupport, Params, VerboseLogging, /*Transpose=*/false); + runCopyConvert(D3DDevice, DxcSupport, Params, VerboseLogging, + /*Transpose=*/false); } void DxilConf_SM610_LinAlg::CopyConvert_Wave_16x16_F16_Transpose() { @@ -772,7 +774,8 @@ void DxilConf_SM610_LinAlg::CopyConvert_Wave_16x16_F16_Transpose() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runCopyConvert(D3DDevice, DxcSupport, Params, VerboseLogging, /*Transpose=*/true); + runCopyConvert(D3DDevice, DxcSupport, Params, VerboseLogging, + /*Transpose=*/true); } static const char MatMatMulShader[] = R"( @@ -809,8 +812,9 @@ static const char MatMatMulShader[] = R"( )"; static void runMatMatMul(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, MatrixDim K, float AFill, float BFill) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, MatrixDim K, + float AFill, float BFill) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -823,7 +827,8 @@ static void runMatMatMul(ID3D12Device *Device, compileShader(DxcSupport, MatMatMulShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, AFill * BFill * K, /*Increment=*/false); + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + AFill * BFill * K, /*Increment=*/false); auto Op = createComputeOp(MatMatMulShader, "cs_6_10", "UAV(u0)", Args.c_str()); @@ -848,7 +853,8 @@ void DxilConf_SM610_LinAlg::MatMatMul_Wave_16x16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, /*AFill=*/2.0f, /*BFill=*/3.0f); + runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, + /*AFill=*/2.0f, /*BFill=*/3.0f); } } // namespace LinAlg From 740cd6267054b1efaa8b6d0afec19e4f6779d2dc Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 12:47:37 -0600 Subject: [PATCH 03/21] Add more tests --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 1d420d1ca6..4d055f946f 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -315,6 +315,8 @@ class DxilConf_SM610_LinAlg { // Matrix Arithmetic TEST_METHOD(MatMatMul_Wave_16x16x16_F16); + TEST_METHOD(MatMatMulAccum_Wave_16x16x16_F16); + TEST_METHOD(MatAccum_Wave_16x16_F16); private: CComPtr D3DDevice; @@ -857,4 +859,160 @@ void DxilConf_SM610_LinAlg::MatMatMul_Wave_16x16x16_F16() { /*AFill=*/2.0f, /*BFill=*/3.0f); } +static const char MatMatMulAccumShader[] = R"( + #define USE_A 0 + #define USE_B 1 + #define USE_ACC 2 + + RWByteAddressBuffer Output : register(u0); + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, K_DIM, USE_A, SCOPE)]] + MatA; + __builtin_LinAlg_FillMatrix(MatA, A_FILL); + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, K_DIM, N_DIM, USE_B, SCOPE)]] + MatB; + __builtin_LinAlg_FillMatrix(MatB, B_FILL); + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE)]] + MatC; + __builtin_LinAlg_FillMatrix(MatC, C_FILL); + + __builtin_LinAlg_MatrixMatrixMultiplyAccumulate(MatC, MatA, MatB, MatC); + + __builtin_LinAlg_MatrixStoreToDescriptor( + MatC, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runMatMatMulAccum(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, MatrixDim K, + float AFill, float BFill, float CFill) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DK_DIM=" << K; + ExtraDefs << " -DA_FILL=" << AFill; + ExtraDefs << " -DB_FILL=" << BFill; + ExtraDefs << " -DC_FILL=" << CFill; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, MatMatMulAccumShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + AFill * BFill * K + CFill, /*Increment=*/false); + + auto Op = + createComputeOp(MatMatMulAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::MatMatMulAccum_Wave_16x16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runMatMatMulAccum(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, + /*AFill=*/2.0f, /*BFill=*/3.0f, /*CFill=*/4.0f); +} + +static const char MatAccumShader[] = R"( + #define USE_A 0 + #define USE_ACC 2 + + RWByteAddressBuffer Output : register(u0); + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE)]] + MatLHS; + __builtin_LinAlg_FillMatrix(MatLHS, LHS_FILL); + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE)]] + MatRHS; + __builtin_LinAlg_FillMatrix(MatRHS, RHS_FILL); + + __builtin_LinAlg_MatrixAccumulate(MatLHS, MatLHS, MatRHS); + + __builtin_LinAlg_MatrixStoreToDescriptor( + MatLHS, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runMatAccum(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float LHSFill, float RHSFill) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DLHS_FILL=" << LHSFill; + ExtraDefs << " -DRHS_FILL=" << RHSFill; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, MatAccumShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + LHSFill + RHSFill, /*Increment=*/false); + + auto Op = + createComputeOp(MatAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::MatAccum_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runMatAccum(D3DDevice, DxcSupport, Params, VerboseLogging, + /*LHSFill=*/2.0f, /*RHSFill=*/3.0f); +} + } // namespace LinAlg From b10f6caed62233a5e5a0804640507331dc143e84 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 15:49:57 -0600 Subject: [PATCH 04/21] more tests --- .../unittests/HLSLExec/HlslExecTestUtils.cpp | 2 +- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 236 ++++++++++++++++-- 2 files changed, 215 insertions(+), 23 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp index 10dfc63b37..c5783a0161 100644 --- a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp +++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp @@ -751,7 +751,7 @@ void compileShader(dxc::SpecificDllLoader &DxcSupport, const char *Source, if (VerboseLogging) { hlsl_test::LogCommentFmt(L"Shader Source:"); - hlsl_test::LogCommentFmt(L"%c", Source); + hlsl_test::LogCommentFmt(std::wstring(Source, Source + strlen(Source)).c_str()); } hlsl_test::LogCommentFmt(LogFlags.str().c_str()); diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 4d055f946f..55b64c79c9 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -199,32 +199,42 @@ static bool verifyComponentBuffer(ComponentType CompType, const void *Actual, } static bool fillInputBuffer(LPCSTR Name, std::vector &Data, - ComponentType CompType, size_t NumElements) { + ComponentType CompType, size_t NumElements, + size_t StartingVal = 1, bool Increment = true) { if (_stricmp(Name, "Input") != 0) return true; switch (CompType) { - case ComponentType::F32: { - float *Ptr = reinterpret_cast(Data.data()); - for (size_t I = 0; I < NumElements; I++) - Ptr[I] = static_cast(I + 1); - return true; - } - case ComponentType::I32: { - int32_t *Ptr = reinterpret_cast(Data.data()); - for (size_t I = 0; I < NumElements; I++) - Ptr[I] = static_cast(I + 1); - return true; - } - case ComponentType::F16: { - HLSLHalf_t *Ptr = reinterpret_cast(Data.data()); - for (size_t I = 0; I < NumElements; I++) - Ptr[I] = HLSLHalf_t(static_cast(I + 1)); - return true; + case ComponentType::F32: + case ComponentType::I32: + case ComponentType::F16: + break; + default: + return false; } + + for (size_t I = 0; I < NumElements; ++I) { + size_t Value = StartingVal + (Increment ? I : 0); + switch (CompType) { + case ComponentType::F32: { + float *Ptr = reinterpret_cast(Data.data()); + Ptr[I] = static_cast(Value); + break; + } + case ComponentType::I32: { + int32_t *Ptr = reinterpret_cast(Data.data()); + Ptr[I] = static_cast(Value); + break; + } + case ComponentType::F16: { + HLSLHalf_t *Ptr = reinterpret_cast(Data.data()); + Ptr[I] = HLSLHalf_t(static_cast(Value)); + break; + } + } } - return false; + return true; } static VariantCompType makeExpected(ComponentType CompType, MatrixDim M, @@ -313,11 +323,15 @@ class DxilConf_SM610_LinAlg { TEST_METHOD(CopyConvert_Wave_16x16_F16); TEST_METHOD(CopyConvert_Wave_16x16_F16_Transpose); - // Matrix Arithmetic + // Matrix Matrix Arithmetic TEST_METHOD(MatMatMul_Wave_16x16x16_F16); TEST_METHOD(MatMatMulAccum_Wave_16x16x16_F16); TEST_METHOD(MatAccum_Wave_16x16_F16); + // Matrix Vector Arithmetic + TEST_METHOD(MatVecMul_Thread_16x16_F16); + TEST_METHOD(MatVecMulAdd_Thread_16x16_F16); + private: CComPtr D3DDevice; dxc::SpecificDllLoader DxcSupport; @@ -855,8 +869,8 @@ void DxilConf_SM610_LinAlg::MatMatMul_Wave_16x16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, - /*AFill=*/2.0f, /*BFill=*/3.0f); +// runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, +// /*AFill=*/2.0f, /*BFill=*/3.0f); } static const char MatMatMulAccumShader[] = R"( @@ -1015,4 +1029,182 @@ void DxilConf_SM610_LinAlg::MatAccum_Wave_16x16_F16() { /*LHSFill=*/2.0f, /*RHSFill=*/3.0f); } +static const char MatVecMulShader[] = R"( + #define USE_A 0 + #define SCOPE_THREAD 0 + + RWByteAddressBuffer Input : register(u0); + RWByteAddressBuffer Output : register(u1); + + [numthreads(NUMTHREADS, 1, 1)] + void main() { + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] + Mat; + __builtin_LinAlg_FillMatrix(Mat, MAT_FILL); + + vector InVec; + for (uint I = 0; I < M_DIM; ++I) { + InVec[I] = Input.Load(I * ELEM_SIZE); + } + + vector OutVec; + __builtin_LinAlg_MatrixVectorMultiply( + OutVec, Mat, OUTPUT_SIGNED, InVec, IN_INTERP); + + for (uint I = 0; I < M_DIM; ++I) { + Output.Store(I * ELEM_SIZE, OutVec[I]); + } + } +)"; + +static void runMatVecMul(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float MatFill, bool OutputSigned, ComponentType InputInterp) { + const size_t NumElements = Params.M; + const size_t BufferSize = elementSize(Params.CompType) * NumElements; + + std::stringstream ExtraDefs; + ExtraDefs << " -DMAT_FILL=" << MatFill; + ExtraDefs << " -DOUTPUT_SIGNED=" << OutputSigned; + ExtraDefs << " -DIN_INTERP=" << static_cast(InputInterp); + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, MatVecMulShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, 1, + MatFill * Params.N, /*Increment=*/false); + + auto Op = createComputeOp(MatVecMulShader, "cs_6_10", "UAV(u0), UAV(u1)", + Args.c_str()); + addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); + + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements, /*StartingVal=*/1, /*Increment=*/false), + "Saw unsupported component type"); + }); + + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::MatVecMul_Thread_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Scope = MatrixScope::Thread; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 1; + Params.Enable16Bit = true; + runMatVecMul(D3DDevice, DxcSupport, Params, VerboseLogging, + /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16); +} + +static const char MatVecMulAddShader[] = R"( + #define USE_A 0 + #define SCOPE_THREAD 0 + + RWByteAddressBuffer Input : register(u0); + RWByteAddressBuffer Output : register(u1); + + [numthreads(NUMTHREADS, 1, 1)] + void main() { + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] + Mat; + __builtin_LinAlg_FillMatrix(Mat, MAT_FILL); + + vector InVec; + for (uint I = 0; I < M_DIM; ++I) { + InVec[I] = Input.Load(I * ELEM_SIZE); + } + + // TODO: this is just copying InVec but it should be a unique value + vector BiasVec; + for (uint I = 0; I < M_DIM; ++I) { + BiasVec[I] = Input.Load(I * ELEM_SIZE); + } + + vector OutVec; + __builtin_LinAlg_MatrixVectorMultiplyAdd( + OutVec, Mat, OUTPUT_SIGNED, InVec, IN_INTERP, BiasVec, BIAS_INTERP); + + for (uint I = 0; I < M_DIM; ++I) { + Output.Store(I * ELEM_SIZE, OutVec[I]); + } + } +)"; + +static void runMatVecMulAdd(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float MatFill, bool OutputSigned, ComponentType InputInterp, + ComponentType BiasInterp) { + const size_t NumElements = Params.M; + const size_t BufferSize = elementSize(Params.CompType) * NumElements; + + std::stringstream ExtraDefs; + ExtraDefs << " -DMAT_FILL=" << MatFill; + ExtraDefs << " -DOUTPUT_SIGNED=" << OutputSigned; + ExtraDefs << " -DIN_INTERP=" << static_cast(InputInterp); + ExtraDefs << " -DBIAS_INTERP=" << static_cast(BiasInterp); + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, MatVecMulAddShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, 1, + MatFill * Params.N + 1, /*Increment=*/false); + + auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", "UAV(u0), UAV(u1)", + Args.c_str()); + addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); + + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements, /*StartingVal=*/1, /*Increment=*/false), + "Saw unsupported component type"); + }); + + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Scope = MatrixScope::Thread; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 1; + Params.Enable16Bit = true; + runMatVecMulAdd(D3DDevice, DxcSupport, Params, VerboseLogging, + /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16, ComponentType::F16); +} + } // namespace LinAlg From 44f62c3e4b8fcb7caba3b2eb194d20c054d8d567 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 15:57:43 -0600 Subject: [PATCH 05/21] small rename --- tools/clang/unittests/HLSLExec/LinAlgTests.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 55b64c79c9..5b15ba624e 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -310,7 +310,7 @@ class DxilConf_SM610_LinAlg { TEST_METHOD_SETUP(setupMethod); // Load/Store - TEST_METHOD(LoadStoreRoundtrip_Wave_16x16_F16); + TEST_METHOD(LoadStoreDescriptor_Wave_16x16_F16); // Splat Store TEST_METHOD(SplatStore_Wave_16x16_F16); @@ -382,7 +382,7 @@ bool DxilConf_SM610_LinAlg::setupMethod() { return D3D12SDK->createDevice(&D3DDevice, D3D_SHADER_MODEL_6_10, false); } -static const char LoadStoreShader[] = R"( +static const char LoadStoreDescriptorShader[] = R"( RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); @@ -402,7 +402,7 @@ static const char LoadStoreShader[] = R"( } )"; -static void runLoadStoreRoundtrip(ID3D12Device *Device, +static void runLoadStoreDescriptor(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, const MatrixParams &Params, bool Verbose) { const size_t NumElements = Params.totalElements(); @@ -414,12 +414,12 @@ static void runLoadStoreRoundtrip(ID3D12Device *Device, std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); - compileShader(DxcSupport, LoadStoreShader, "cs_6_10", Args, Verbose); + compileShader(DxcSupport, LoadStoreDescriptorShader, "cs_6_10", Args, Verbose); auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1); // Construct the ShaderOp: two UAV buffers, load from one, store to other. - auto Op = createComputeOp(LoadStoreShader, "cs_6_10", "UAV(u0), UAV(u1)", + auto Op = createComputeOp(LoadStoreDescriptorShader, "cs_6_10", "UAV(u0), UAV(u1)", Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); @@ -442,7 +442,7 @@ static void runLoadStoreRoundtrip(ID3D12Device *Device, Expected, NumElements, Verbose)); } -void DxilConf_SM610_LinAlg::LoadStoreRoundtrip_Wave_16x16_F16() { +void DxilConf_SM610_LinAlg::LoadStoreDescriptor_Wave_16x16_F16() { MatrixParams Params = {}; Params.CompType = ComponentType::F16; Params.M = 16; @@ -452,7 +452,7 @@ void DxilConf_SM610_LinAlg::LoadStoreRoundtrip_Wave_16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runLoadStoreRoundtrip(D3DDevice, DxcSupport, Params, VerboseLogging); + runLoadStoreDescriptor(D3DDevice, DxcSupport, Params, VerboseLogging); } static const char SplatStoreShader[] = R"( From 8f45a0ce9bbf268b2474052fbd8c5fb758263b91 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 13 Apr 2026 22:04:58 +0000 Subject: [PATCH 06/21] chore: autopublish 2026-04-13T22:04:58Z --- .../unittests/HLSLExec/HlslExecTestUtils.cpp | 3 +- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 82 ++++++++++--------- 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp index c5783a0161..d2c3cac0b2 100644 --- a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp +++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp @@ -751,7 +751,8 @@ void compileShader(dxc::SpecificDllLoader &DxcSupport, const char *Source, if (VerboseLogging) { hlsl_test::LogCommentFmt(L"Shader Source:"); - hlsl_test::LogCommentFmt(std::wstring(Source, Source + strlen(Source)).c_str()); + hlsl_test::LogCommentFmt( + std::wstring(Source, Source + strlen(Source)).c_str()); } hlsl_test::LogCommentFmt(LogFlags.str().c_str()); diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 5b15ba624e..e2456737fe 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -403,8 +403,8 @@ static const char LoadStoreDescriptorShader[] = R"( )"; static void runLoadStoreDescriptor(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -414,13 +414,14 @@ static void runLoadStoreDescriptor(ID3D12Device *Device, std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); - compileShader(DxcSupport, LoadStoreDescriptorShader, "cs_6_10", Args, Verbose); + compileShader(DxcSupport, LoadStoreDescriptorShader, "cs_6_10", Args, + Verbose); auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1); // Construct the ShaderOp: two UAV buffers, load from one, store to other. - auto Op = createComputeOp(LoadStoreDescriptorShader, "cs_6_10", "UAV(u0), UAV(u1)", - Args.c_str()); + auto Op = createComputeOp(LoadStoreDescriptorShader, "cs_6_10", + "UAV(u0), UAV(u1)", Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Input"); @@ -869,8 +870,8 @@ void DxilConf_SM610_LinAlg::MatMatMul_Wave_16x16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; -// runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, -// /*AFill=*/2.0f, /*BFill=*/3.0f); + // runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, + // /*AFill=*/2.0f, /*BFill=*/3.0f); } static const char MatMatMulAccumShader[] = R"( @@ -910,8 +911,9 @@ static const char MatMatMulAccumShader[] = R"( static void runMatMatMulAccum(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, MatrixDim K, - float AFill, float BFill, float CFill) { + const MatrixParams &Params, bool Verbose, + MatrixDim K, float AFill, float BFill, + float CFill) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -986,8 +988,8 @@ static const char MatAccumShader[] = R"( static void runMatAccum(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, - float LHSFill, float RHSFill) { + const MatrixParams &Params, bool Verbose, float LHSFill, + float RHSFill) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -1002,8 +1004,7 @@ static void runMatAccum(ID3D12Device *Device, auto Expected = makeExpected(Params.CompType, Params.M, Params.N, LHSFill + RHSFill, /*Increment=*/false); - auto Op = - createComputeOp(MatAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); + auto Op = createComputeOp(MatAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Output"); @@ -1061,7 +1062,8 @@ static const char MatVecMulShader[] = R"( static void runMatVecMul(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, const MatrixParams &Params, bool Verbose, - float MatFill, bool OutputSigned, ComponentType InputInterp) { + float MatFill, bool OutputSigned, + ComponentType InputInterp) { const size_t NumElements = Params.M; const size_t BufferSize = elementSize(Params.CompType) * NumElements; @@ -1074,8 +1076,8 @@ static void runMatVecMul(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, 1, - MatFill * Params.N, /*Increment=*/false); + auto Expected = makeExpected(Params.CompType, Params.M, 1, MatFill * Params.N, + /*Increment=*/false); auto Op = createComputeOp(MatVecMulShader, "cs_6_10", "UAV(u0), UAV(u1)", Args.c_str()); @@ -1084,15 +1086,14 @@ static void runMatVecMul(ID3D12Device *Device, addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = - runShaderOp(Device, DxcSupport, std::move(Op), - [NumElements, Params](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, - NumElements, /*StartingVal=*/1, /*Increment=*/false), - "Saw unsupported component type"); - }); - + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, + /*StartingVal=*/1, /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -1111,7 +1112,7 @@ void DxilConf_SM610_LinAlg::MatVecMul_Thread_16x16_F16() { Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMul(D3DDevice, DxcSupport, Params, VerboseLogging, - /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16); + /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16); } static const char MatVecMulAddShader[] = R"( @@ -1150,10 +1151,11 @@ static const char MatVecMulAddShader[] = R"( )"; static void runMatVecMulAdd(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, - float MatFill, bool OutputSigned, ComponentType InputInterp, - ComponentType BiasInterp) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float MatFill, bool OutputSigned, + ComponentType InputInterp, + ComponentType BiasInterp) { const size_t NumElements = Params.M; const size_t BufferSize = elementSize(Params.CompType) * NumElements; @@ -1177,15 +1179,14 @@ static void runMatVecMulAdd(ID3D12Device *Device, addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = - runShaderOp(Device, DxcSupport, std::move(Op), - [NumElements, Params](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, - NumElements, /*StartingVal=*/1, /*Increment=*/false), - "Saw unsupported component type"); - }); - + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, + /*StartingVal=*/1, /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -1204,7 +1205,8 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMulAdd(D3DDevice, DxcSupport, Params, VerboseLogging, - /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16, ComponentType::F16); + /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16, + ComponentType::F16); } } // namespace LinAlg From be2435360ecce7bc30cff4f710460b3436921ebb Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 16:42:32 -0600 Subject: [PATCH 07/21] outer product --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 86 ++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index e2456737fe..63a32ce34b 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -331,6 +331,7 @@ class DxilConf_SM610_LinAlg { // Matrix Vector Arithmetic TEST_METHOD(MatVecMul_Thread_16x16_F16); TEST_METHOD(MatVecMulAdd_Thread_16x16_F16); + TEST_METHOD(OuterProduct_Thread_16x16_F16); private: CComPtr D3DDevice; @@ -870,8 +871,8 @@ void DxilConf_SM610_LinAlg::MatMatMul_Wave_16x16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - // runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, - // /*AFill=*/2.0f, /*BFill=*/3.0f); + runMatMatMul(D3DDevice, DxcSupport, Params, VerboseLogging, /*K=*/16, + /*AFill=*/2.0f, /*BFill=*/3.0f); } static const char MatMatMulAccumShader[] = R"( @@ -1209,4 +1210,85 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { ComponentType::F16); } +static const char OuterProductShader[] = R"( + #define USE_A 0 + #define SCOPE_THREAD 0 + + RWByteAddressBuffer Input : register(u0); + RWByteAddressBuffer Output : register(u1); + + [numthreads(NUMTHREADS, 1, 1)] + void main() { + vector VecA; + for (uint I = 0; I < M_DIM; ++I) { + VecA[I] = Input.Load(I * ELEM_SIZE); + } + + uint EndVecA = M_DIM * ELEM_SIZE; + + vector VecB; + for (uint I = 0; I < N_DIM; ++I) { + VecB[I] = Input.Load(EndVecA + I * ELEM_SIZE); + } + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] + Mat; + __builtin_LinAlg_MatrixOuterProduct(Mat, VecA, VecB); + + __builtin_LinAlg_MatrixStoreToDescriptor( + Mat, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runOuterProduct(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose) { + const size_t NumVecElements = Params.M + Params.N; + const size_t InBuffSize = NumVecElements * elementSize(Params.CompType); + const size_t NumMatElements = Params.totalElements(); + const size_t OutBufferSize = Params.totalBytes(); + + std::string Args = buildCompilerArgs(Params); + + compileShader(DxcSupport, OuterProductShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + 4, /*Increment=*/false); + + auto Op = createComputeOp(OuterProductShader, "cs_6_10", "UAV(u0), UAV(u1)", + Args.c_str()); + addUAVBuffer(Op.get(), "Input", InBuffSize, false, "byname"); + addUAVBuffer(Op.get(), "Output", OutBufferSize, true); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); + + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumVecElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumVecElements, + /*StartingVal=*/2, /*Increment=*/false), + "Saw unsupported component type"); + }); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumMatElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::OuterProduct_Thread_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Scope = MatrixScope::Thread; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 1; + Params.Enable16Bit = true; + runOuterProduct(D3DDevice, DxcSupport, Params, VerboseLogging); +} + } // namespace LinAlg From c5a1f67455f027288f355d497e824e602425038c Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 17:01:17 -0600 Subject: [PATCH 08/21] QueryAccumulatorLayout --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 63a32ce34b..2c3d1fef12 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -333,6 +333,9 @@ class DxilConf_SM610_LinAlg { TEST_METHOD(MatVecMulAdd_Thread_16x16_F16); TEST_METHOD(OuterProduct_Thread_16x16_F16); + // Query Accumulator Layout + TEST_METHOD(QueryAccumLayout); + private: CComPtr D3DDevice; dxc::SpecificDllLoader DxcSupport; @@ -1291,4 +1294,43 @@ void DxilConf_SM610_LinAlg::OuterProduct_Thread_16x16_F16() { runOuterProduct(D3DDevice, DxcSupport, Params, VerboseLogging); } +static const char QueryAccumLayoutShader[] = R"( + RWByteAddressBuffer Output : register(u0); + + [numthreads(1, 1, 1)] + void main() { + uint Layout = __builtin_LinAlg_MatrixQueryAccumulatorLayout(); + Output.Store(0, Layout); + } +)"; + +static void runQueryAccumLayout(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + bool Verbose) { + std::string Args = "-HV 202x"; + size_t BufferSize = elementSize(ComponentType::I32); + + compileShader(DxcSupport, QueryAccumLayoutShader, "cs_6_10", Args, Verbose); + + auto Op = + createComputeOp(QueryAccumLayoutShader, "cs_6_10", "UAV(u0)", Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + const uint32_t *Out = static_cast(OutData.data()); + + // Accum Layout must be A or B + VERIFY_IS_TRUE(Out[0] == static_cast(MatrixUse::A) || Out[0] == static_cast(MatrixUse::B)); + if (Verbose) + hlsl_test::LogCommentFmt(L"AccumulatorLayout = %u", Out[0]); +} + +void DxilConf_SM610_LinAlg::QueryAccumLayout() { + runQueryAccumLayout(D3DDevice, DxcSupport, VerboseLogging); +} + } // namespace LinAlg From 70bd8dcf1a2a2854cc8dbf83ab38aae2a962b3e4 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 17:25:46 -0600 Subject: [PATCH 09/21] Fix test oob crash --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 2c3d1fef12..cd3fe6035c 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -237,7 +237,7 @@ static bool fillInputBuffer(LPCSTR Name, std::vector &Data, return true; } -static VariantCompType makeExpected(ComponentType CompType, MatrixDim M, +static VariantCompType makeExpectedMat(ComponentType CompType, MatrixDim M, MatrixDim N, float StartingVal, bool Increment = true, bool Transpose = false) { @@ -291,6 +291,12 @@ static VariantCompType makeExpected(ComponentType CompType, MatrixDim M, } } +static VariantCompType makeExpectedVec(ComponentType CompType, MatrixDim NumElements, + float StartingVal, + bool Increment = true) { + return makeExpectedMat(CompType, 1, NumElements, StartingVal, Increment, false); +} + class DxilConf_SM610_LinAlg { public: BEGIN_TEST_CLASS(DxilConf_SM610_LinAlg) @@ -421,7 +427,7 @@ static void runLoadStoreDescriptor(ID3D12Device *Device, compileShader(DxcSupport, LoadStoreDescriptorShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 1); // Construct the ShaderOp: two UAV buffers, load from one, store to other. auto Op = createComputeOp(LoadStoreDescriptorShader, "cs_6_10", @@ -493,7 +499,7 @@ static void runSplatStore(ID3D12Device *Device, compileShader(DxcSupport, SplatStoreShader, "cs_6_10", Args, Verbose); auto Expected = - makeExpected(Params.CompType, Params.M, Params.N, FillValue, false); + makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue, false); auto Op = createComputeOp(SplatStoreShader, "cs_6_10", "UAV(u0)", Args.c_str()); @@ -575,7 +581,7 @@ static void runElementAccess(ID3D12Device *Device, compileShader(DxcSupport, ElementAccessShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 1); auto Op = createComputeOp(ElementAccessShader, "cs_6_10", "UAV(u0), UAV(u1)", Args.c_str()); @@ -666,7 +672,7 @@ static void runElementSet(ID3D12Device *Device, compileShader(DxcSupport, ElementSetShader, "cs_6_10", Args, Verbose); // Start counting from 6 since each element was increased by 5 - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 6); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 6); auto Op = createComputeOp(ElementSetShader, "cs_6_10", "UAV(u0), UAV(u1)", Args.c_str()); @@ -744,7 +750,7 @@ static void runCopyConvert(ID3D12Device *Device, compileShader(DxcSupport, CopyConvertShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1, + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 1, /*Increment=*/true, Transpose); // Construct the ShaderOp: two UAV buffers, load from one, store to other. @@ -848,7 +854,7 @@ static void runMatMatMul(ID3D12Device *Device, compileShader(DxcSupport, MatMatMulShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, AFill * BFill * K, /*Increment=*/false); auto Op = @@ -931,7 +937,7 @@ static void runMatMatMulAccum(ID3D12Device *Device, compileShader(DxcSupport, MatMatMulAccumShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, AFill * BFill * K + CFill, /*Increment=*/false); auto Op = @@ -1005,7 +1011,7 @@ static void runMatAccum(ID3D12Device *Device, compileShader(DxcSupport, MatAccumShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, LHSFill + RHSFill, /*Increment=*/false); auto Op = createComputeOp(MatAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); @@ -1080,7 +1086,7 @@ static void runMatVecMul(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, 1, MatFill * Params.N, + auto Expected = makeExpectedVec(Params.CompType, Params.M, MatFill * Params.N, /*Increment=*/false); auto Op = createComputeOp(MatVecMulShader, "cs_6_10", "UAV(u0), UAV(u1)", @@ -1173,7 +1179,7 @@ static void runMatVecMulAdd(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulAddShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, 1, + auto Expected = makeExpectedVec(Params.CompType, Params.M, MatFill * Params.N + 1, /*Increment=*/false); auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", "UAV(u0), UAV(u1)", @@ -1256,7 +1262,7 @@ static void runOuterProduct(ID3D12Device *Device, compileShader(DxcSupport, OuterProductShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpected(Params.CompType, Params.M, Params.N, + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 4, /*Increment=*/false); auto Op = createComputeOp(OuterProductShader, "cs_6_10", "UAV(u0), UAV(u1)", From 40ad3713db4ab393ec6b1acf50464a69daad16de Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 17:41:59 -0600 Subject: [PATCH 10/21] store not allowed on thread --- tools/clang/unittests/HLSLExec/LinAlgTests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index cd3fe6035c..62c02715cd 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -1245,7 +1245,7 @@ static const char OuterProductShader[] = R"( Mat; __builtin_LinAlg_MatrixOuterProduct(Mat, VecA, VecB); - __builtin_LinAlg_MatrixStoreToDescriptor( + __builtin_LinAlg_MatrixAccumulateToDescriptor( Mat, Output, 0, STRIDE, LAYOUT, 128); } )"; From 7ce1fe28360f009e1bbbef6aebf5ec7aef6750ee Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Mon, 13 Apr 2026 18:03:45 -0600 Subject: [PATCH 11/21] AccumulateToDescriptor --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 82 ++++++++++++++++++- 1 file changed, 79 insertions(+), 3 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 62c02715cd..b276d823dd 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -315,11 +315,11 @@ class DxilConf_SM610_LinAlg { TEST_CLASS_SETUP(setupClass); TEST_METHOD_SETUP(setupMethod); - // Load/Store + // Load/Store/Accumulate Descriptor TEST_METHOD(LoadStoreDescriptor_Wave_16x16_F16); - - // Splat Store TEST_METHOD(SplatStore_Wave_16x16_F16); + TEST_METHOD(AccumulateDescriptor_Wave_16x16_F16); + TEST_METHOD(AccumulateDescriptor_Thread_16x16_F16); // Element access TEST_METHOD(ElementAccess_Wave_16x16_F16); @@ -528,6 +528,82 @@ void DxilConf_SM610_LinAlg::SplatStore_Wave_16x16_F16() { runSplatStore(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); } +static const char AccumulateDescriptorShader[] = R"( + #define USE_ACC 2 + RWByteAddressBuffer Output : register(u0); + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE)]] + Mat; + __builtin_LinAlg_FillMatrix(Mat, FILL_VALUE); + __builtin_LinAlg_MatrixAccumulateToDescriptor( + Mat, Output, 0, STRIDE, LAYOUT, 128); + } +)"; + +static void runAccumulateDescriptor(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, float FillValue, + bool Verbose) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << "-DFILL_VALUE=" << FillValue; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, AccumulateDescriptorShader, "cs_6_10", Args, Verbose); + + auto Expected = + makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue, false); + + auto Op = + createComputeOp(AccumulateDescriptorShader, "cs_6_10", "UAV(u0)", Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::AccumulateDescriptor_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::Accumulator; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); +} + +void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::Accumulator; + Params.Scope = MatrixScope::Thread; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 1; + Params.Enable16Bit = true; + runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); +} + static const char ElementAccessShader[] = R"( RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); From e752af99749a4425d6c04dabe7d461583d367aa5 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 12:06:50 -0600 Subject: [PATCH 12/21] Fix thread shader to use SRV and Load --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index b276d823dd..708924d2de 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -530,7 +530,9 @@ void DxilConf_SM610_LinAlg::SplatStore_Wave_16x16_F16() { static const char AccumulateDescriptorShader[] = R"( #define USE_ACC 2 - RWByteAddressBuffer Output : register(u0); + + ByteAddressBuffer Input : register(t0); + RWByteAddressBuffer Output : register(u1); [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] @@ -541,7 +543,10 @@ static const char AccumulateDescriptorShader[] = R"( __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE)]] Mat; - __builtin_LinAlg_FillMatrix(Mat, FILL_VALUE); + __builtin_LinAlg_MatrixLoadFromDescriptor( + Mat, Input, 0, STRIDE, LAYOUT, 128); + __builtin_LinAlg_MatrixAccumulateToDescriptor( + Mat, Output, 0, STRIDE, LAYOUT, 128); __builtin_LinAlg_MatrixAccumulateToDescriptor( Mat, Output, 0, STRIDE, LAYOUT, 128); } @@ -549,27 +554,33 @@ static const char AccumulateDescriptorShader[] = R"( static void runAccumulateDescriptor(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, float FillValue, + const MatrixParams &Params, int FillValue, bool Verbose) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); - std::stringstream ExtraDefs; - ExtraDefs << "-DFILL_VALUE=" << FillValue; - - std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + std::string Args = buildCompilerArgs(Params); compileShader(DxcSupport, AccumulateDescriptorShader, "cs_6_10", Args, Verbose); auto Expected = - makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue, false); + makeExpectedMat(Params.CompType, Params.M, Params.N, static_cast(FillValue) * 2, false); - auto Op = - createComputeOp(AccumulateDescriptorShader, "cs_6_10", "UAV(u0)", Args.c_str()); + auto Op = createComputeOp(AccumulateDescriptorShader, "cs_6_10", + "SRV(t0), UAV(u1)", Args.c_str()); + addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); - auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements, /*StartingVal=*/ FillValue, /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -588,7 +599,7 @@ void DxilConf_SM610_LinAlg::AccumulateDescriptor_Wave_16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); + runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 12, VerboseLogging); } void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() { @@ -601,7 +612,7 @@ void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 1; Params.Enable16Bit = true; - runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging); + runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 19, VerboseLogging); } static const char ElementAccessShader[] = R"( From 3e6b53487a68a6225a96c7a5a085e1617d887999 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 13:22:22 -0600 Subject: [PATCH 13/21] fix the matvecmul tests --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 79 +++++++++---------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 708924d2de..0eaa4ba008 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -1131,7 +1131,7 @@ static const char MatVecMulShader[] = R"( #define USE_A 0 #define SCOPE_THREAD 0 - RWByteAddressBuffer Input : register(u0); + ByteAddressBuffer Input : register(t0); RWByteAddressBuffer Output : register(u1); [numthreads(NUMTHREADS, 1, 1)] @@ -1139,7 +1139,8 @@ static const char MatVecMulShader[] = R"( __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] Mat; - __builtin_LinAlg_FillMatrix(Mat, MAT_FILL); + __builtin_LinAlg_MatrixLoadFromDescriptor( + Mat, Input, 0, STRIDE, LAYOUT, 128); vector InVec; for (uint I = 0; I < M_DIM; ++I) { @@ -1159,13 +1160,12 @@ static const char MatVecMulShader[] = R"( static void runMatVecMul(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, const MatrixParams &Params, bool Verbose, - float MatFill, bool OutputSigned, + int FillValue, bool OutputSigned, ComponentType InputInterp) { - const size_t NumElements = Params.M; - const size_t BufferSize = elementSize(Params.CompType) * NumElements; + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); std::stringstream ExtraDefs; - ExtraDefs << " -DMAT_FILL=" << MatFill; ExtraDefs << " -DOUTPUT_SIGNED=" << OutputSigned; ExtraDefs << " -DIN_INTERP=" << static_cast(InputInterp); @@ -1173,30 +1173,30 @@ static void runMatVecMul(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedVec(Params.CompType, Params.M, MatFill * Params.N, - /*Increment=*/false); + auto Expected = makeExpectedVec(Params.CompType, Params.M, + static_cast(FillValue * FillValue * Params.N), /*Increment=*/false); - auto Op = createComputeOp(MatVecMulShader, "cs_6_10", "UAV(u0), UAV(u1)", - Args.c_str()); + auto Op = createComputeOp(MatVecMulShader, "cs_6_10", + "SRV(t0), UAV(u1)", Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = runShaderOp( - Device, DxcSupport, std::move(Op), - [NumElements, Params](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, - /*StartingVal=*/1, /*Increment=*/false), - "Saw unsupported component type"); - }); + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements, /*StartingVal=*/ FillValue, /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), - Expected, NumElements, Verbose)); + Expected, Params.M, Verbose)); } void DxilConf_SM610_LinAlg::MatVecMul_Thread_16x16_F16() { @@ -1209,14 +1209,14 @@ void DxilConf_SM610_LinAlg::MatVecMul_Thread_16x16_F16() { Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMul(D3DDevice, DxcSupport, Params, VerboseLogging, - /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16); + /*FillValue=*/2, /*OutputSigned=*/true, ComponentType::F16); } static const char MatVecMulAddShader[] = R"( #define USE_A 0 #define SCOPE_THREAD 0 - RWByteAddressBuffer Input : register(u0); + ByteAddressBuffer Input : register(t0); RWByteAddressBuffer Output : register(u1); [numthreads(NUMTHREADS, 1, 1)] @@ -1224,14 +1224,14 @@ static const char MatVecMulAddShader[] = R"( __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]] Mat; - __builtin_LinAlg_FillMatrix(Mat, MAT_FILL); + __builtin_LinAlg_MatrixLoadFromDescriptor( + Mat, Input, 0, STRIDE, LAYOUT, 128); vector InVec; for (uint I = 0; I < M_DIM; ++I) { InVec[I] = Input.Load(I * ELEM_SIZE); } - // TODO: this is just copying InVec but it should be a unique value vector BiasVec; for (uint I = 0; I < M_DIM; ++I) { BiasVec[I] = Input.Load(I * ELEM_SIZE); @@ -1250,14 +1250,13 @@ static const char MatVecMulAddShader[] = R"( static void runMatVecMulAdd(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, const MatrixParams &Params, bool Verbose, - float MatFill, bool OutputSigned, + int FillValue, bool OutputSigned, ComponentType InputInterp, ComponentType BiasInterp) { - const size_t NumElements = Params.M; - const size_t BufferSize = elementSize(Params.CompType) * NumElements; + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); std::stringstream ExtraDefs; - ExtraDefs << " -DMAT_FILL=" << MatFill; ExtraDefs << " -DOUTPUT_SIGNED=" << OutputSigned; ExtraDefs << " -DIN_INTERP=" << static_cast(InputInterp); ExtraDefs << " -DBIAS_INTERP=" << static_cast(BiasInterp); @@ -1267,29 +1266,29 @@ static void runMatVecMulAdd(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulAddShader, "cs_6_10", Args, Verbose); auto Expected = makeExpectedVec(Params.CompType, Params.M, - MatFill * Params.N + 1, /*Increment=*/false); + static_cast(FillValue * FillValue * Params.N + FillValue), /*Increment=*/false); - auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", "UAV(u0), UAV(u1)", - Args.c_str()); + auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", + "SRV(t0), UAV(u1)", Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = runShaderOp( - Device, DxcSupport, std::move(Op), - [NumElements, Params](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, - /*StartingVal=*/1, /*Increment=*/false), - "Saw unsupported component type"); - }); + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements, /*StartingVal=*/ FillValue, /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), - Expected, NumElements, Verbose)); + Expected, Params.M, Verbose)); } void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { @@ -1302,7 +1301,7 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() { Params.NumThreads = 1; Params.Enable16Bit = true; runMatVecMulAdd(D3DDevice, DxcSupport, Params, VerboseLogging, - /*MatFill=*/2.0f, /*OutputSigned=*/true, ComponentType::F16, + /*FillValue=*/2, /*OutputSigned=*/true, ComponentType::F16, ComponentType::F16); } From be6102a4fa5144437818ac2ec1e87ce0e47c4a04 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 14:41:52 -0600 Subject: [PATCH 14/21] format --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 130 ++++++++++-------- 1 file changed, 71 insertions(+), 59 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 0eaa4ba008..e3a8ac020f 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -238,9 +238,9 @@ static bool fillInputBuffer(LPCSTR Name, std::vector &Data, } static VariantCompType makeExpectedMat(ComponentType CompType, MatrixDim M, - MatrixDim N, float StartingVal, - bool Increment = true, - bool Transpose = false) { + MatrixDim N, float StartingVal, + bool Increment = true, + bool Transpose = false) { const size_t NumElements = M * N; std::vector Floats(NumElements); std::vector Ints(NumElements); @@ -291,10 +291,11 @@ static VariantCompType makeExpectedMat(ComponentType CompType, MatrixDim M, } } -static VariantCompType makeExpectedVec(ComponentType CompType, MatrixDim NumElements, - float StartingVal, - bool Increment = true) { - return makeExpectedMat(CompType, 1, NumElements, StartingVal, Increment, false); +static VariantCompType makeExpectedVec(ComponentType CompType, + MatrixDim NumElements, float StartingVal, + bool Increment = true) { + return makeExpectedMat(CompType, 1, NumElements, StartingVal, Increment, + false); } class DxilConf_SM610_LinAlg { @@ -553,18 +554,19 @@ static const char AccumulateDescriptorShader[] = R"( )"; static void runAccumulateDescriptor(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, int FillValue, - bool Verbose) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, int FillValue, + bool Verbose) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); std::string Args = buildCompilerArgs(Params); - compileShader(DxcSupport, AccumulateDescriptorShader, "cs_6_10", Args, Verbose); + compileShader(DxcSupport, AccumulateDescriptorShader, "cs_6_10", Args, + Verbose); - auto Expected = - makeExpectedMat(Params.CompType, Params.M, Params.N, static_cast(FillValue) * 2, false); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, + static_cast(FillValue) * 2, false); auto Op = createComputeOp(AccumulateDescriptorShader, "cs_6_10", "SRV(t0), UAV(u1)", Args.c_str()); @@ -573,14 +575,15 @@ static void runAccumulateDescriptor(ID3D12Device *Device, addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = - runShaderOp(Device, DxcSupport, std::move(Op), - [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, - NumElements, /*StartingVal=*/ FillValue, /*Increment=*/false), - "Saw unsupported component type"); - }); + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, + /*StartingVal=*/FillValue, + /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -838,7 +841,7 @@ static void runCopyConvert(ID3D12Device *Device, compileShader(DxcSupport, CopyConvertShader, "cs_6_10", Args, Verbose); auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 1, - /*Increment=*/true, Transpose); + /*Increment=*/true, Transpose); // Construct the ShaderOp: two UAV buffers, load from one, store to other. auto Op = createComputeOp(CopyConvertShader, "cs_6_10", "UAV(u0), UAV(u1)", @@ -942,7 +945,7 @@ static void runMatMatMul(ID3D12Device *Device, compileShader(DxcSupport, MatMatMulShader, "cs_6_10", Args, Verbose); auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, - AFill * BFill * K, /*Increment=*/false); + AFill * BFill * K, /*Increment=*/false); auto Op = createComputeOp(MatMatMulShader, "cs_6_10", "UAV(u0)", Args.c_str()); @@ -1024,8 +1027,9 @@ static void runMatMatMulAccum(ID3D12Device *Device, compileShader(DxcSupport, MatMatMulAccumShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, - AFill * BFill * K + CFill, /*Increment=*/false); + auto Expected = + makeExpectedMat(Params.CompType, Params.M, Params.N, + AFill * BFill * K + CFill, /*Increment=*/false); auto Op = createComputeOp(MatMatMulAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); @@ -1099,7 +1103,7 @@ static void runMatAccum(ID3D12Device *Device, compileShader(DxcSupport, MatAccumShader, "cs_6_10", Args, Verbose); auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, - LHSFill + RHSFill, /*Increment=*/false); + LHSFill + RHSFill, /*Increment=*/false); auto Op = createComputeOp(MatAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); @@ -1173,24 +1177,27 @@ static void runMatVecMul(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedVec(Params.CompType, Params.M, - static_cast(FillValue * FillValue * Params.N), /*Increment=*/false); + auto Expected = + makeExpectedVec(Params.CompType, Params.M, + static_cast(FillValue * FillValue * Params.N), + /*Increment=*/false); - auto Op = createComputeOp(MatVecMulShader, "cs_6_10", - "SRV(t0), UAV(u1)", Args.c_str()); + auto Op = createComputeOp(MatVecMulShader, "cs_6_10", "SRV(t0), UAV(u1)", + Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = - runShaderOp(Device, DxcSupport, std::move(Op), - [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, - NumElements, /*StartingVal=*/ FillValue, /*Increment=*/false), - "Saw unsupported component type"); - }); + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, + /*StartingVal=*/FillValue, + /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -1265,24 +1272,27 @@ static void runMatVecMulAdd(ID3D12Device *Device, compileShader(DxcSupport, MatVecMulAddShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedVec(Params.CompType, Params.M, - static_cast(FillValue * FillValue * Params.N + FillValue), /*Increment=*/false); + auto Expected = makeExpectedVec( + Params.CompType, Params.M, + static_cast(FillValue * FillValue * Params.N + FillValue), + /*Increment=*/false); - auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", - "SRV(t0), UAV(u1)", Args.c_str()); + auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", "SRV(t0), UAV(u1)", + Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = - runShaderOp(Device, DxcSupport, std::move(Op), - [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, - NumElements, /*StartingVal=*/ FillValue, /*Increment=*/false), - "Saw unsupported component type"); - }); + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumElements, Params, FillValue](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumElements, + /*StartingVal=*/FillValue, + /*Increment=*/false), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -1348,8 +1358,8 @@ static void runOuterProduct(ID3D12Device *Device, compileShader(DxcSupport, OuterProductShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, - 4, /*Increment=*/false); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 4, + /*Increment=*/false); auto Op = createComputeOp(OuterProductShader, "cs_6_10", "UAV(u0), UAV(u1)", Args.c_str()); @@ -1361,8 +1371,9 @@ static void runOuterProduct(ID3D12Device *Device, auto Result = runShaderOp( Device, DxcSupport, std::move(Op), [NumVecElements, Params](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, NumVecElements, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumVecElements, /*StartingVal=*/2, /*Increment=*/false), "Saw unsupported component type"); }); @@ -1397,15 +1408,15 @@ static const char QueryAccumLayoutShader[] = R"( )"; static void runQueryAccumLayout(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - bool Verbose) { + dxc::SpecificDllLoader &DxcSupport, + bool Verbose) { std::string Args = "-HV 202x"; size_t BufferSize = elementSize(ComponentType::I32); compileShader(DxcSupport, QueryAccumLayoutShader, "cs_6_10", Args, Verbose); - auto Op = - createComputeOp(QueryAccumLayoutShader, "cs_6_10", "UAV(u0)", Args.c_str()); + auto Op = createComputeOp(QueryAccumLayoutShader, "cs_6_10", "UAV(u0)", + Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Output"); @@ -1416,7 +1427,8 @@ static void runQueryAccumLayout(ID3D12Device *Device, const uint32_t *Out = static_cast(OutData.data()); // Accum Layout must be A or B - VERIFY_IS_TRUE(Out[0] == static_cast(MatrixUse::A) || Out[0] == static_cast(MatrixUse::B)); + VERIFY_IS_TRUE(Out[0] == static_cast(MatrixUse::A) || + Out[0] == static_cast(MatrixUse::B)); if (Verbose) hlsl_test::LogCommentFmt(L"AccumulatorLayout = %u", Out[0]); } From aea86d58e396e50fdaad6fb84a725d704f9657d2 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 16:45:32 -0600 Subject: [PATCH 15/21] groupshared load --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index e3a8ac020f..e05e69ba2b 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -322,6 +322,9 @@ class DxilConf_SM610_LinAlg { TEST_METHOD(AccumulateDescriptor_Wave_16x16_F16); TEST_METHOD(AccumulateDescriptor_Thread_16x16_F16); + // Load/Store/Accumulate Memory + TEST_METHOD(LoadMemory_Wave_16x16_F16); + // Element access TEST_METHOD(ElementAccess_Wave_16x16_F16); TEST_METHOD(ElementSet_Wave_16x16_F16); @@ -1437,4 +1440,88 @@ void DxilConf_SM610_LinAlg::QueryAccumLayout() { runQueryAccumLayout(D3DDevice, DxcSupport, VerboseLogging); } +static const char LoadMemoryShader[] = R"( + RWByteAddressBuffer Input : register(u0); + RWByteAddressBuffer Output : register(u1); + + groupshared ELEM_TYPE GsData[M_DIM * N_DIM]; + + #define ELEM_PER_THREAD (M_DIM * N_DIM / NUMTHREADS) + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + for (uint I = 0; I < ELEM_PER_THREAD; ++I) { + uint Index = threadID * ELEM_PER_THREAD + I; + GsData[Index] = Input.Load(Index * ELEM_SIZE); + } + + GroupMemoryBarrierWithGroupSync(); + + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE, SCOPE)]] + Mat; + __builtin_LinAlg_MatrixLoadFromMemory( + Mat, GsData, OFFSET, STRIDE, LAYOUT); + __builtin_LinAlg_MatrixStoreToDescriptor( + Mat, Output, OFFSET, STRIDE, LAYOUT, 128); + } +)"; + +static void runLoadMemory(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DOFFSET=" << 0; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, LoadMemoryShader, "cs_6_10", Args, + Verbose); + + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 1); + + auto Op = createComputeOp(LoadMemoryShader, "cs_6_10", "UAV(u0), UAV(u1)", + Args.c_str()); + addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Input"); + addRootUAV(Op.get(), 1, "Output"); + + auto Result = runShaderOp( + Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements), + "Saw unsupported component type"); + }); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::LoadMemory_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::A; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runLoadMemory(D3DDevice, DxcSupport, Params, VerboseLogging); +} + + } // namespace LinAlg From 36faa95643a320ef2b53ac8d9bc64945a439969d Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 17:29:08 -0600 Subject: [PATCH 16/21] store to memory --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index e05e69ba2b..51be761f1e 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -324,6 +324,7 @@ class DxilConf_SM610_LinAlg { // Load/Store/Accumulate Memory TEST_METHOD(LoadMemory_Wave_16x16_F16); + TEST_METHOD(StoreMemory_Wave_16x16_F16); // Element access TEST_METHOD(ElementAccess_Wave_16x16_F16); @@ -1523,5 +1524,73 @@ void DxilConf_SM610_LinAlg::LoadMemory_Wave_16x16_F16() { runLoadMemory(D3DDevice, DxcSupport, Params, VerboseLogging); } +static const char StoreMemoryShader[] = R"( + RWByteAddressBuffer Output : register(u0); + groupshared ELEM_TYPE GsData[M_DIM * N_DIM]; + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE, SCOPE)]] + Mat; + __builtin_LinAlg_FillMatrix(Mat, FILL_VALUE); + + __builtin_LinAlg_MatrixStoreToMemory( + Mat, GsData, OFFSET, STRIDE, LAYOUT); + + for (uint I = 0; I < M_DIM*N_DIM; ++I) { + Output.Store(I*ELEM_SIZE, GsData[I]); + } + } +)"; + +static void runStoreMemory(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float FillValue) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DOFFSET=" << 0; + ExtraDefs << " -DFILL_VALUE=" << FillValue; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, StoreMemoryShader, "cs_6_10", Args, + Verbose); + + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue, /*Increment=*/false); + + auto Op = createComputeOp(StoreMemoryShader, "cs_6_10", "UAV(u0)", + Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::StoreMemory_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::A; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runStoreMemory(D3DDevice, DxcSupport, Params, VerboseLogging, /*FillValue=*/7.0f); +} } // namespace LinAlg From a1805a8f683c313705c87e63b33a1eaba7c325e4 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 18:07:26 -0600 Subject: [PATCH 17/21] Accum to memory --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 81 ++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 51be761f1e..fc95635fee 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -325,6 +325,7 @@ class DxilConf_SM610_LinAlg { // Load/Store/Accumulate Memory TEST_METHOD(LoadMemory_Wave_16x16_F16); TEST_METHOD(StoreMemory_Wave_16x16_F16); + TEST_METHOD(AccumulateMemory_Wave_16x16_F16); // Element access TEST_METHOD(ElementAccess_Wave_16x16_F16); @@ -1444,7 +1445,6 @@ void DxilConf_SM610_LinAlg::QueryAccumLayout() { static const char LoadMemoryShader[] = R"( RWByteAddressBuffer Input : register(u0); RWByteAddressBuffer Output : register(u1); - groupshared ELEM_TYPE GsData[M_DIM * N_DIM]; #define ELEM_PER_THREAD (M_DIM * N_DIM / NUMTHREADS) @@ -1593,4 +1593,83 @@ void DxilConf_SM610_LinAlg::StoreMemory_Wave_16x16_F16() { runStoreMemory(D3DDevice, DxcSupport, Params, VerboseLogging, /*FillValue=*/7.0f); } +static const char AccumulateMemoryShader[] = R"( + RWByteAddressBuffer Output : register(u0); + groupshared ELEM_TYPE GsData[M_DIM * N_DIM]; + + #define ELEM_PER_THREAD (M_DIM * N_DIM / NUMTHREADS) + + [WaveSize(4, 64)] + [numthreads(NUMTHREADS, 1, 1)] + void main(uint threadID : SV_GroupIndex) { + ELEM_TYPE fill = FILL_VALUE; + for (uint I = 0; I < ELEM_PER_THREAD; ++I) { + uint Index = threadID * ELEM_PER_THREAD + I; + GsData[Index] = fill; + } + + GroupMemoryBarrierWithGroupSync(); + + if (WaveReadLaneFirst(threadID) != 0) + return; + + __builtin_LinAlgMatrix + [[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE, SCOPE)]] + Mat; + __builtin_LinAlg_FillMatrix(Mat, FILL_VALUE); + + __builtin_LinAlg_MatrixAccumulateToMemory( + Mat, GsData, OFFSET, STRIDE, LAYOUT); + + for (uint I = 0; I < M_DIM*N_DIM; ++I) { + Output.Store(I*ELEM_SIZE, GsData[I]); + } + } +)"; + +static void runAccumulateMemory(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float FillValue) { + const size_t NumElements = Params.totalElements(); + const size_t BufferSize = Params.totalBytes(); + + std::stringstream ExtraDefs; + ExtraDefs << " -DOFFSET=" << 0; + ExtraDefs << " -DFILL_VALUE=" << FillValue; + + std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); + + compileShader(DxcSupport, AccumulateMemoryShader, "cs_6_10", Args, + Verbose); + + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue * 2, /*Increment=*/false); + + auto Op = createComputeOp(AccumulateMemoryShader, "cs_6_10", "UAV(u0)", + Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(Params.CompType, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::AccumulateMemory_Wave_16x16_F16() { + MatrixParams Params = {}; + Params.CompType = ComponentType::F16; + Params.M = 16; + Params.N = 16; + Params.Use = MatrixUse::Accumulator; + Params.Scope = MatrixScope::Wave; + Params.Layout = LinalgMatrixLayout::RowMajor; + Params.NumThreads = 64; + Params.Enable16Bit = true; + runAccumulateMemory(D3DDevice, DxcSupport, Params, VerboseLogging, /*FillValue=*/7.0f); +} + } // namespace LinAlg From 6272ab9fcfb74bb2e514c20060281894ce054662 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 18:42:44 -0600 Subject: [PATCH 18/21] Convert --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index fc95635fee..61eb8d7a02 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -348,6 +348,9 @@ class DxilConf_SM610_LinAlg { // Query Accumulator Layout TEST_METHOD(QueryAccumLayout); + // Convert + TEST_METHOD(Convert); + private: CComPtr D3DDevice; dxc::SpecificDllLoader DxcSupport; @@ -1672,4 +1675,51 @@ void DxilConf_SM610_LinAlg::AccumulateMemory_Wave_16x16_F16() { runAccumulateMemory(D3DDevice, DxcSupport, Params, VerboseLogging, /*FillValue=*/7.0f); } +static const char ConvertShader[] = R"( + #define CT_F16 8 + #define CT_F32 9 + + RWByteAddressBuffer Output : register(u0); + + [numthreads(1, 1, 1)] + void main() { + vector InVec = {1.0, 2.0, 3.0, 4.0}; + vector OutVec; + __builtin_LinAlg_Convert(OutVec, InVec, CT_F16, CT_F32); + Output.Store(0, OutVec.x); + Output.Store(4, OutVec.y); + Output.Store(8, OutVec.z); + Output.Store(12, OutVec.w); + } +)"; + +static void runConvert(ID3D12Device *Device, + dxc::SpecificDllLoader &DxcSupport, + bool Verbose) { + std::string Args = "-HV 202x"; + MatrixDim NumElements = 4; + size_t BufferSize = elementSize(ComponentType::F32) * NumElements; + + compileShader(DxcSupport, ConvertShader, "cs_6_10", Args, Verbose); + + auto Expected = makeExpectedVec(ComponentType::F32, NumElements, 1.0); + + auto Op = createComputeOp(ConvertShader, "cs_6_10", "UAV(u0)", + Args.c_str()); + addUAVBuffer(Op.get(), "Output", BufferSize, true); + addRootUAV(Op.get(), 0, "Output"); + + auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); + + MappedData OutData; + Result->Test->GetReadBackData("Output", &OutData); + + VERIFY_IS_TRUE(verifyComponentBuffer(ComponentType::F32, OutData.data(), + Expected, NumElements, Verbose)); +} + +void DxilConf_SM610_LinAlg::Convert() { + runConvert(D3DDevice, DxcSupport, VerboseLogging); +} + } // namespace LinAlg From 7649f266e5ace322e87e7793b7c3d1b650af0c33 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Tue, 14 Apr 2026 18:43:45 -0600 Subject: [PATCH 19/21] format --- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 65 +++++++++---------- 1 file changed, 32 insertions(+), 33 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 61eb8d7a02..1ef824eae3 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -1476,8 +1476,8 @@ static const char LoadMemoryShader[] = R"( )"; static void runLoadMemory(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -1486,8 +1486,7 @@ static void runLoadMemory(ID3D12Device *Device, std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); - compileShader(DxcSupport, LoadMemoryShader, "cs_6_10", Args, - Verbose); + compileShader(DxcSupport, LoadMemoryShader, "cs_6_10", Args, Verbose); auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, 1); @@ -1498,14 +1497,14 @@ static void runLoadMemory(ID3D12Device *Device, addRootUAV(Op.get(), 0, "Input"); addRootUAV(Op.get(), 1, "Output"); - auto Result = runShaderOp( - Device, DxcSupport, std::move(Op), - [NumElements, Params](LPCSTR Name, std::vector &Data, - st::ShaderOp *) { - VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, - NumElements), - "Saw unsupported component type"); - }); + auto Result = + runShaderOp(Device, DxcSupport, std::move(Op), + [NumElements, Params](LPCSTR Name, std::vector &Data, + st::ShaderOp *) { + VERIFY_IS_TRUE(fillInputBuffer(Name, Data, Params.CompType, + NumElements), + "Saw unsupported component type"); + }); MappedData OutData; Result->Test->GetReadBackData("Output", &OutData); @@ -1552,9 +1551,9 @@ static const char StoreMemoryShader[] = R"( )"; static void runStoreMemory(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, - float FillValue) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float FillValue) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -1564,13 +1563,13 @@ static void runStoreMemory(ID3D12Device *Device, std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); - compileShader(DxcSupport, StoreMemoryShader, "cs_6_10", Args, - Verbose); + compileShader(DxcSupport, StoreMemoryShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue, /*Increment=*/false); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, + FillValue, /*Increment=*/false); - auto Op = createComputeOp(StoreMemoryShader, "cs_6_10", "UAV(u0)", - Args.c_str()); + auto Op = + createComputeOp(StoreMemoryShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Output"); @@ -1593,7 +1592,8 @@ void DxilConf_SM610_LinAlg::StoreMemory_Wave_16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runStoreMemory(D3DDevice, DxcSupport, Params, VerboseLogging, /*FillValue=*/7.0f); + runStoreMemory(D3DDevice, DxcSupport, Params, VerboseLogging, + /*FillValue=*/7.0f); } static const char AccumulateMemoryShader[] = R"( @@ -1631,9 +1631,9 @@ static const char AccumulateMemoryShader[] = R"( )"; static void runAccumulateMemory(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - const MatrixParams &Params, bool Verbose, - float FillValue) { + dxc::SpecificDllLoader &DxcSupport, + const MatrixParams &Params, bool Verbose, + float FillValue) { const size_t NumElements = Params.totalElements(); const size_t BufferSize = Params.totalBytes(); @@ -1643,10 +1643,10 @@ static void runAccumulateMemory(ID3D12Device *Device, std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str()); - compileShader(DxcSupport, AccumulateMemoryShader, "cs_6_10", Args, - Verbose); + compileShader(DxcSupport, AccumulateMemoryShader, "cs_6_10", Args, Verbose); - auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, FillValue * 2, /*Increment=*/false); + auto Expected = makeExpectedMat(Params.CompType, Params.M, Params.N, + FillValue * 2, /*Increment=*/false); auto Op = createComputeOp(AccumulateMemoryShader, "cs_6_10", "UAV(u0)", Args.c_str()); @@ -1672,7 +1672,8 @@ void DxilConf_SM610_LinAlg::AccumulateMemory_Wave_16x16_F16() { Params.Layout = LinalgMatrixLayout::RowMajor; Params.NumThreads = 64; Params.Enable16Bit = true; - runAccumulateMemory(D3DDevice, DxcSupport, Params, VerboseLogging, /*FillValue=*/7.0f); + runAccumulateMemory(D3DDevice, DxcSupport, Params, VerboseLogging, + /*FillValue=*/7.0f); } static const char ConvertShader[] = R"( @@ -1693,9 +1694,8 @@ static const char ConvertShader[] = R"( } )"; -static void runConvert(ID3D12Device *Device, - dxc::SpecificDllLoader &DxcSupport, - bool Verbose) { +static void runConvert(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, + bool Verbose) { std::string Args = "-HV 202x"; MatrixDim NumElements = 4; size_t BufferSize = elementSize(ComponentType::F32) * NumElements; @@ -1704,8 +1704,7 @@ static void runConvert(ID3D12Device *Device, auto Expected = makeExpectedVec(ComponentType::F32, NumElements, 1.0); - auto Op = createComputeOp(ConvertShader, "cs_6_10", "UAV(u0)", - Args.c_str()); + auto Op = createComputeOp(ConvertShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); addRootUAV(Op.get(), 0, "Output"); From 7e306f026220206afa658965fae6a00d67891653 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Wed, 15 Apr 2026 11:39:31 -0600 Subject: [PATCH 20/21] address comments --- .../unittests/HLSLExec/HlslExecTestUtils.cpp | 3 +- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 42 +++++++++---------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp index d2c3cac0b2..304110825b 100644 --- a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp +++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp @@ -751,8 +751,7 @@ void compileShader(dxc::SpecificDllLoader &DxcSupport, const char *Source, if (VerboseLogging) { hlsl_test::LogCommentFmt(L"Shader Source:"); - hlsl_test::LogCommentFmt( - std::wstring(Source, Source + strlen(Source)).c_str()); + hlsl_test::LogCommentFmt(L"%S", Source); } hlsl_test::LogCommentFmt(LogFlags.str().c_str()); diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 1ef824eae3..5a7a3122b1 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -407,8 +407,8 @@ static const char LoadStoreDescriptorShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -480,8 +480,8 @@ static const char SplatStoreShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -545,8 +545,8 @@ static const char AccumulateDescriptorShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -639,7 +639,7 @@ static const char ElementAccessShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -735,8 +735,8 @@ static const char ElementSetShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -815,8 +815,8 @@ static const char CopyConvertShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -912,8 +912,8 @@ static const char MatMatMulShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -991,8 +991,8 @@ static const char MatMatMulAccumShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -1074,8 +1074,8 @@ static const char MatAccumShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -1462,7 +1462,7 @@ static const char LoadMemoryShader[] = R"( GroupMemoryBarrierWithGroupSync(); - if (WaveReadLaneFirst(threadID) != 0) + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -1532,8 +1532,8 @@ static const char StoreMemoryShader[] = R"( [WaveSize(4, 64)] [numthreads(NUMTHREADS, 1, 1)] - void main(uint threadID : SV_GroupIndex) { - if (WaveReadLaneFirst(threadID) != 0) + void main() { + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix @@ -1613,7 +1613,7 @@ static const char AccumulateMemoryShader[] = R"( GroupMemoryBarrierWithGroupSync(); - if (WaveReadLaneFirst(threadID) != 0) + if (GetGroupWaveIndex() != 0) return; __builtin_LinAlgMatrix From c6314b200c1dbc8ab8e76481b64d3f815622db73 Mon Sep 17 00:00:00 2001 From: Ashley Coleman Date: Thu, 16 Apr 2026 10:34:37 -0600 Subject: [PATCH 21/21] address comments --- .../unittests/HLSLExec/HlslExecTestUtils.cpp | 25 +++++++- .../unittests/HLSLExec/HlslExecTestUtils.h | 8 ++- .../clang/unittests/HLSLExec/LinAlgTests.cpp | 62 +++++++++---------- 3 files changed, 61 insertions(+), 34 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp index 304110825b..8715ea1ac4 100644 --- a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp +++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.cpp @@ -691,7 +691,30 @@ void addUAVBuffer(st::ShaderOp *Op, const char *Name, UINT64 Width, Op->Resources.push_back(Res); } -void addRootUAV(st::ShaderOp *Op, UINT Index, const char *ResName) { +void addSRVBuffer(st::ShaderOp *Op, const char *Name, UINT64 Width, + const char *Init) { + st::ShaderOpResource Res = {}; + Res.Name = Op->Strings.insert(Name); + Res.Init = Op->Strings.insert(Init); + Res.ReadBack = FALSE; + + Res.HeapProperties.Type = D3D12_HEAP_TYPE_DEFAULT; + Res.HeapFlags = D3D12_HEAP_FLAG_NONE; + Res.Desc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER; + Res.Desc.Width = Width; + Res.Desc.Height = 1; + Res.Desc.DepthOrArraySize = 1; + Res.Desc.MipLevels = 1; + Res.Desc.SampleDesc.Count = 1; + Res.Desc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR; + Res.Desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS; + Res.InitialResourceState = D3D12_RESOURCE_STATE_COPY_DEST; + Res.TransitionTo = D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE; + + Op->Resources.push_back(Res); +} + +void addRootView(st::ShaderOp *Op, UINT Index, const char *ResName) { st::ShaderOpRootValue RV = {}; RV.ResName = Op->Strings.insert(ResName); RV.HeapName = nullptr; diff --git a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h index 3e64171478..435a091a54 100644 --- a/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h +++ b/tools/clang/unittests/HLSLExec/HlslExecTestUtils.h @@ -88,8 +88,12 @@ createComputeOp(const char *Source, const char *Target, const char *RootSig, void addUAVBuffer(st::ShaderOp *Op, const char *Name, UINT64 Width, bool ReadBack, const char *Init = "zero"); -/// Bind a resource to a root UAV parameter by index. -void addRootUAV(st::ShaderOp *Op, UINT Index, const char *ResName); +/// Add a SRV buffer resource to a ShaderOp. +void addSRVBuffer(st::ShaderOp *Op, const char *Name, UINT64 Width, + const char *Init = "zero"); + +/// Bind a resource to a root view parameter by index. +void addRootView(st::ShaderOp *Op, UINT Index, const char *ResName); /// Run a programmatically-built ShaderOp and return the result. std::shared_ptr diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp index 5a7a3122b1..d9d22863f1 100644 --- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp +++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp @@ -443,8 +443,8 @@ static void runLoadStoreDescriptor(ID3D12Device *Device, "UAV(u0), UAV(u1)", Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op), @@ -513,7 +513,7 @@ static void runSplatStore(ID3D12Device *Device, auto Op = createComputeOp(SplatStoreShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -578,10 +578,10 @@ static void runAccumulateDescriptor(ID3D12Device *Device, auto Op = createComputeOp(AccumulateDescriptorShader, "cs_6_10", "SRV(t0), UAV(u1)", Args.c_str()); - addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addSRVBuffer(Op.get(), "Input", BufferSize, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp( Device, DxcSupport, std::move(Op), @@ -685,8 +685,8 @@ static void runElementAccess(ID3D12Device *Device, Args.c_str()); addUAVBuffer(Op.get(), "Input", MatrixSize, false, "byname"); addUAVBuffer(Op.get(), "Output", OutputBufSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op), @@ -776,8 +776,8 @@ static void runElementSet(ID3D12Device *Device, Args.c_str()); addUAVBuffer(Op.get(), "Input", MatrixSize, false, "byname"); addUAVBuffer(Op.get(), "Output", MatrixSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op), @@ -856,8 +856,8 @@ static void runCopyConvert(ID3D12Device *Device, Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op), @@ -958,7 +958,7 @@ static void runMatMatMul(ID3D12Device *Device, auto Op = createComputeOp(MatMatMulShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -1042,7 +1042,7 @@ static void runMatMatMulAccum(ID3D12Device *Device, auto Op = createComputeOp(MatMatMulAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -1115,7 +1115,7 @@ static void runMatAccum(ID3D12Device *Device, auto Op = createComputeOp(MatAccumShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -1154,7 +1154,7 @@ static const char MatVecMulShader[] = R"( __builtin_LinAlg_MatrixLoadFromDescriptor( Mat, Input, 0, STRIDE, LAYOUT, 128); - vector InVec; + vector InVec; for (uint I = 0; I < M_DIM; ++I) { InVec[I] = Input.Load(I * ELEM_SIZE); } @@ -1192,10 +1192,10 @@ static void runMatVecMul(ID3D12Device *Device, auto Op = createComputeOp(MatVecMulShader, "cs_6_10", "SRV(t0), UAV(u1)", Args.c_str()); - addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addSRVBuffer(Op.get(), "Input", BufferSize, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp( Device, DxcSupport, std::move(Op), @@ -1242,7 +1242,7 @@ static const char MatVecMulAddShader[] = R"( __builtin_LinAlg_MatrixLoadFromDescriptor( Mat, Input, 0, STRIDE, LAYOUT, 128); - vector InVec; + vector InVec; for (uint I = 0; I < M_DIM; ++I) { InVec[I] = Input.Load(I * ELEM_SIZE); } @@ -1287,10 +1287,10 @@ static void runMatVecMulAdd(ID3D12Device *Device, auto Op = createComputeOp(MatVecMulAddShader, "cs_6_10", "SRV(t0), UAV(u1)", Args.c_str()); - addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); + addSRVBuffer(Op.get(), "Input", BufferSize, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp( Device, DxcSupport, std::move(Op), @@ -1373,8 +1373,8 @@ static void runOuterProduct(ID3D12Device *Device, Args.c_str()); addUAVBuffer(Op.get(), "Input", InBuffSize, false, "byname"); addUAVBuffer(Op.get(), "Output", OutBufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp( Device, DxcSupport, std::move(Op), @@ -1426,7 +1426,7 @@ static void runQueryAccumLayout(ID3D12Device *Device, auto Op = createComputeOp(QueryAccumLayoutShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -1494,8 +1494,8 @@ static void runLoadMemory(ID3D12Device *Device, Args.c_str()); addUAVBuffer(Op.get(), "Input", BufferSize, false, "byname"); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Input"); - addRootUAV(Op.get(), 1, "Output"); + addRootView(Op.get(), 0, "Input"); + addRootView(Op.get(), 1, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op), @@ -1571,7 +1571,7 @@ static void runStoreMemory(ID3D12Device *Device, auto Op = createComputeOp(StoreMemoryShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -1651,7 +1651,7 @@ static void runAccumulateMemory(ID3D12Device *Device, auto Op = createComputeOp(AccumulateMemoryShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op)); @@ -1706,7 +1706,7 @@ static void runConvert(ID3D12Device *Device, dxc::SpecificDllLoader &DxcSupport, auto Op = createComputeOp(ConvertShader, "cs_6_10", "UAV(u0)", Args.c_str()); addUAVBuffer(Op.get(), "Output", BufferSize, true); - addRootUAV(Op.get(), 0, "Output"); + addRootView(Op.get(), 0, "Output"); auto Result = runShaderOp(Device, DxcSupport, std::move(Op));