From fb890811c304a13d2318283b3b012b2804ee737c Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 6 May 2025 14:40:37 -0400 Subject: [PATCH 1/6] Clean up vector handling code by introducing TestVector --- tools/clang/unittests/HLSLExec/CoopVec.h | 200 +++++++ .../unittests/HLSLExec/ExecutionTest.cpp | 532 +++++++----------- 2 files changed, 416 insertions(+), 316 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index f166c61f67..cd24a556bd 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -4,6 +4,8 @@ #include #include + +#include #include #include "dxc/Support/microcom.h" @@ -61,6 +63,7 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { }; namespace CoopVecHelpers { + template static std::vector CreateAllOnesInputMatrix(uint32_t Width, uint32_t Height) { @@ -354,6 +357,203 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) { return D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32; } } + +struct TestVector { +private: + size_t NumVectors = 0; + size_t VectorSize = 0; + size_t ElementSize = 0; + size_t Stride = 0; + size_t TotalBytes = 0; + uint8_t *Buffer = nullptr; + +public: + TestVector(size_t NumVectors, size_t VectorSize, size_t ElementSize, + size_t Alignment = 16) + : NumVectors(NumVectors), VectorSize(VectorSize), + ElementSize(ElementSize) { + if (NumVectors == 0) { + throw std::invalid_argument("NumVectors must be greater than 0"); + } + if (VectorSize == 0) { + throw std::invalid_argument("VectorSize must be greater than 0"); + } + if (ElementSize == 0) { + throw std::invalid_argument("ElementSize must be greater than 0"); + } + + size_t VectorBytes = VectorSize * ElementSize; + Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment; + TotalBytes = Stride * NumVectors; + + void *Ptr = nullptr; +#ifdef _MSC_VER + Ptr = _aligned_malloc(TotalBytes, Alignment); +#else + Ptr = 
std::aligned_alloc(Alignment, TotalBytes); +#endif + Buffer = reinterpret_cast(Ptr); + std::fill(Buffer, Buffer + TotalBytes, (uint8_t)0xFF); + } + + // Copy constructor + TestVector(const TestVector &other) + : NumVectors(other.NumVectors), VectorSize(other.VectorSize), + ElementSize(other.ElementSize), Stride(other.Stride), + TotalBytes(other.TotalBytes) { + + void *Ptr = nullptr; +#ifdef _MSC_VER + Ptr = _aligned_malloc(TotalBytes, 16); +#else + Ptr = std::aligned_alloc(16, TotalBytes); +#endif + Buffer = reinterpret_cast(Ptr); + + if (other.Buffer) { + std::memcpy(Buffer, other.Buffer, TotalBytes); + } + } + + // Move constructor + TestVector(TestVector &&other) noexcept + : NumVectors(other.NumVectors), VectorSize(other.VectorSize), + ElementSize(other.ElementSize), Stride(other.Stride), + TotalBytes(other.TotalBytes), Buffer(other.Buffer) { + + // Reset the source object + other.NumVectors = 0; + other.VectorSize = 0; + other.ElementSize = 0; + other.Stride = 0; + other.TotalBytes = 0; + other.Buffer = nullptr; + } + + ~TestVector() { + if (Buffer) { +#ifdef _MSC_VER + _aligned_free(Buffer); +#else + std::free(Buffer); +#endif + } + } + + size_t getNumVectors() const { return NumVectors; } + size_t getVectorSize() const { return VectorSize; } + size_t getElementSize() const { return ElementSize; } + size_t getStride() const { return Stride; } + size_t getTotalBytes() const { return TotalBytes; } + uint8_t *getBuffer() { return Buffer; } + const uint8_t *getBuffer() const { return Buffer; } + + template T *getVector(size_t I) { + uint8_t *Ptr = Buffer + I * Stride; + return reinterpret_cast(Ptr); + } + + template const T *getVector(size_t I) const { + const uint8_t *Ptr = Buffer + I * Stride; + return reinterpret_cast(Ptr); + } + + template void fill(const T &Value) { + for (size_t I = 0; I < NumVectors; ++I) { + T *Vec = getVector(I); + for (size_t J = 0; J < VectorSize; ++J) + Vec[J] = Value; + } + } + + template void fillSimpleTestData() { + // Create a 
vector of (1, 1, 0, ...) + for (size_t I = 0; I < NumVectors; ++I) { + T *Vec = getVector(I); + for (size_t J = 0; J < VectorSize; ++J) + if constexpr (std::is_same_v) { + // Special case for HALF, which requires conversion from float + Vec[J] = static_cast( + ConvertFloat32ToFloat16((J == 0 || J == 1) ? 1.0f : 0.0f)); + } else { + Vec[J] = static_cast((J == 0 || J == 1) ? 1 : 0); + } + } + } + + static TestVector + createSimpleTestVector(size_t NumVectors, size_t VectorSize, + D3D12_LINEAR_ALGEBRA_DATATYPE DataType, + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + size_t ElementSize; + switch (DataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + ElementSize = sizeof(int8_t); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + ElementSize = sizeof(int16_t); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { + ElementSize = sizeof(int8_t); + } else { + ElementSize = sizeof(int32_t); + } + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + ElementSize = sizeof(DirectX::PackedVector::HALF); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + ElementSize = sizeof(float); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + TestVector Vec(NumVectors, VectorSize, ElementSize); + switch (DataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + Vec.fillSimpleTestData(); + break; + case 
D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { + Vec.fillSimpleTestData(); + } else { + Vec.fillSimpleTestData(); + } + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + Vec.fillSimpleTestData(); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + return Vec; + } +}; }; // namespace CoopVecHelpers #endif // HAVE_COOPVEC_API diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 55d569dd8d..f47b4624d6 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12241,6 +12241,112 @@ void ExecutionTest::runCoopVecMulSubtest( CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle( DescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + // Setup input data + auto ExpectedOutputBuffer = + std::make_unique(Config.OutputPerThread * Config.NumThreads); + + std::vector InputMatrix; + if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || + MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( + Config.InputPerThread, Config.OutputPerThread); + } else if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || + MulProps.MatrixInterpretation == + 
D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + // Matrix source data is fp32, which gets converted to fp16 during matrix + // conversion + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( + Config.InputPerThread, Config.OutputPerThread); + } else { + WEX::Logging::Log::Error(L"Unsupported matrix data type"); + return; + } + + auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector( + Config.NumThreads, Config.InputPerThread, MulProps.InputType, + MulProps.InputInterpretation); + auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector( + 1, Config.OutputPerThread, MulProps.BiasInterpretation, + MulProps.BiasInterpretation); + + // Calculate reference output + // FIXME: This does not capture all cases, but is sufficient for the preview + // feature set + if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { + int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer(); + float *InputVectorF32 = (float *)InputVector.getBuffer(); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { + int Acc = 0; + + for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { + int InputElem; + if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + InputElem = (int) + InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx]; + } else { + InputElem = InputVector.getVector(ThreadIdx)[InputIdx]; + } + int const MatrixElem = + InputMatrix[OutputIdx * Config.InputPerThread + InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (Config.Bias) { + Acc += InputBiasI32[OutputIdx]; + } + + float Result = float(Acc); + ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = + Result; + } + } + } else if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || + MulProps.MatrixInterpretation == + 
D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + DirectX::PackedVector::HALF *InputVectorFP16 = + (DirectX::PackedVector::HALF *)InputVector.getBuffer(); + DirectX::PackedVector::HALF *InputBiasFP16 = + (DirectX::PackedVector::HALF *)InputBias.getBuffer(); + + // The CPU reference matrix is float + std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); + std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { + float Acc = 0; + + for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { + float const InputElem = ConvertFloat16ToFloat32( + InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]); + float const MatrixElem = + InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (Config.Bias) { + Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); + } + + float Result = Acc; + ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = + Result; + } + } + } + // Create the compute pipeline state for the CoopVec shader CComPtr ComputePipelineState; { @@ -12258,9 +12364,7 @@ void main(uint threadIdx : SV_GroupThreadID) { using namespace dx::linalg; - // Ensure 4-byte alignment for vector loads - uint inputOffset = (INPUT_PER_THREAD * threadIdx * (sizeof(INPUT_DATA_TYPE) / INPUT_DIVISOR)); - inputOffset = (inputOffset + 3) & ~3; // Align to 4 bytes + uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); vector input = InputVector.Load >(inputOffset); MatrixRef mat = { InputMatrix, 0, STRIDE }; @@ -12278,7 +12382,6 @@ void main(uint threadIdx : SV_GroupThreadID) // Ensure 4-byte alignment for vector store uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float); - outputOffset = (outputOffset + 3) & ~3; // Align to 4 bytes OutputBuffer.Store >(outputOffset, result); } )"; @@ -12349,6 +12452,8 @@ 
void main(uint threadIdx : SV_GroupThreadID) auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); auto AccumInterpretationEnumDefine = CreateDefineFromString( L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum); + auto InputVectorStrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -12364,6 +12469,7 @@ void main(uint threadIdx : SV_GroupThreadID) MatrixDataTypeEnumDefine.c_str(), UseBiasDefine.c_str(), AccumInterpretationEnumDefine.c_str(), + InputVectorStrideDefine.c_str(), }; CComPtr IncludeHandler = @@ -12388,36 +12494,9 @@ void main(uint threadIdx : SV_GroupThreadID) 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, IID_PPV_ARGS(&CommandList))); - // Setup input data - auto ExpectedOutputBuffer = - std::make_unique(Config.OutputPerThread * Config.NumThreads); - // Setup input matrix as all-ones in sint8 format. This will later be // converted to the appropriate data type by the matrix conversion API. 
CComPtr InputMatrixSRVResource, InputMatrixSRVUploadResource; - std::vector InputMatrix; - if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // Matrix source data is fp32, which gets converted to fp16 during matrix - // conversion - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); - return; - } CreateTestResources(D3DDevice, CommandList, InputMatrix.data(), InputMatrix.size(), @@ -12427,180 +12506,31 @@ void main(uint threadIdx : SV_GroupThreadID) // Create input vector of an appropriate type. All integer types start as // SINT8 for now. 
CComPtr InputVecSRVResource, InputVecSRVUploadResource; - std::vector InputVector; - - if ((MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32 && - (MulProps.InputInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.InputInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED)) || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - InputVector = - CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported input data type"); - return; - } - if (InputVector.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputVector.resize(InputVector.size() + 4 - (InputVector.size() % 4)); - } - CreateTestResources(D3DDevice, CommandList, InputVector.data(), - InputVector.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputVector.size()), - &InputVecSRVResource, &InputVecSRVUploadResource); + + CreateTestResources( + D3DDevice, CommandList, InputVector.getBuffer(), + InputVector.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputVector.getTotalBytes()), + 
&InputVecSRVResource, &InputVecSRVUploadResource); // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector.size() / sizeof(int32_t)), + (UINT)(InputVector.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource); // Create input bias CComPtr InputBiasSRVResource, InputBiasSRVUploadResource; - std::vector InputBias; - if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || - MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputBias = CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) { - InputBias = - CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) { - InputBias = - CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { - InputBias = CoopVecHelpers::CreateInputBias( - Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputBias = CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported bias data type"); - return; - } - - if (InputBias.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputBias.resize(InputBias.size() + 4 - (InputBias.size() % 4)); - } - CreateTestResources(D3DDevice, CommandList, InputBias.data(), - InputBias.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputBias.size()), + CreateTestResources(D3DDevice, CommandList, InputBias.getBuffer(), + InputBias.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputBias.getTotalBytes()), &InputBiasSRVResource, &InputBiasSRVUploadResource); // This increments baseHandle 
CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputBias.size() / sizeof(int32_t)), + (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)), InputBiasSRVResource); - // Calculate reference output - // FIXME: This does not capture all cases, but is sufficient for the preview - // feature set - if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - // The input bias is really an array of int32_t - std::vector InputBiasI32(InputBias.size() / sizeof(int32_t)); - std::memcpy(InputBiasI32.data(), InputBias.data(), InputBias.size()); - - // The input vector is really an array of float if our vector input type is - // FLOAT32 - std::vector InputVectorF32(InputVector.size() / sizeof(int32_t)); - if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - std::memcpy(InputVectorF32.data(), InputVector.data(), - InputVector.size()); - } - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - int Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - int InputElem; - if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int) - InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx]; - } else { - InputElem = - InputVector[ThreadIdx * Config.InputPerThread + InputIdx]; - } - int const MatrixElem = - InputMatrix[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += InputBiasI32[OutputIdx]; - } - - float Result = float(Acc); - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // The input bias/vector is really an array of float16 - std::vector 
InputVectorFP16( - InputVector.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputVectorFP16.data(), InputVector.data(), InputVector.size()); - - std::vector InputBiasFP16( - InputBias.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputBiasFP16.data(), InputBias.data(), InputBias.size()); - - // The CPU reference matrix is float - std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); - std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - float Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - float const InputElem = ConvertFloat16ToFloat32( - InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]); - float const MatrixElem = - InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); - } - - float Result = Acc; - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } - } - CComPtr ConvertedMatrixResource; { // Create source matrix info @@ -12862,6 +12792,80 @@ void ExecutionTest::runCoopVecOuterProductSubtest( CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle( DescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + // Setup input matrix as all-ones in sint8/fp32 format. This will later be + // converted to the appropriate data type by the matrix conversion API. 
+ + std::vector InputMatrix; + if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, + Config.DimM); + } else if (AccumulateProps.AccumulationType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || + AccumulateProps.AccumulationType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || + AccumulateProps.AccumulationType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + // Matrix source data is fp32, which gets converted to fp16 during matrix + // conversion + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, + Config.DimM); + } else { + WEX::Logging::Log::Error(L"Unsupported matrix data type"); + return; + } + + // Create input vectors + auto InputVector1 = CoopVecHelpers::TestVector::createSimpleTestVector( + Config.NumThreads, Config.DimM, AccumulateProps.InputType, + AccumulateProps.InputType); + auto InputVector2 = CoopVecHelpers::TestVector::createSimpleTestVector( + Config.NumThreads, Config.DimN, AccumulateProps.InputType, + AccumulateProps.InputType); + + // Calculate reference output + auto ExpectedOutputBufferI8 = + CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, Config.DimM); + std::vector ExpectedOutputBuffer(ExpectedOutputBufferI8.size() / + sizeof(float)); + std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(), + ExpectedOutputBufferI8.size()); + + if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { + DirectX::PackedVector::HALF *InputVector1FP16 = + reinterpret_cast( + InputVector1.getBuffer()); + DirectX::PackedVector::HALF *InputVector2FP16 = + reinterpret_cast( + InputVector2.getBuffer()); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int M = 0; M < Config.DimM; ++M) { + for (int N = 0; N < Config.DimN; ++N) { + float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) * + 
ConvertFloat16ToFloat32(InputVector2FP16[N]); + ExpectedOutputBuffer[M * Config.DimN + N] += acc; + } + } + } + } else if (AccumulateProps.InputType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + float *InputVector1FP32 = + reinterpret_cast(InputVector1.getBuffer()); + float *InputVector2FP32 = + reinterpret_cast(InputVector2.getBuffer()); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int M = 0; M < Config.DimM; ++M) { + for (int N = 0; N < Config.DimN; ++N) { + float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] * + InputVector2FP32[ThreadIdx * Config.DimN + N]; + ExpectedOutputBuffer[M * Config.DimN + N] += Acc; + } + } + } + } + // Create a compute pipeline state object. CComPtr ComputePipelineState; { @@ -12880,12 +12884,10 @@ void main(uint threadIdx : SV_GroupThreadID) using namespace dx::linalg; // Ensure 4-byte alignment for vector loads - uint inputOffset1 = (DIM_M * threadIdx * sizeof(INPUT_DATA_TYPE)); - inputOffset1 = (inputOffset1 + 3) & ~3; // Align to 4 bytes + uint inputOffset1 = threadIdx * INPUT_VECTOR_1_STRIDE; vector input1 = InputVector1.Load >(inputOffset1); - uint inputOffset2 = (DIM_N * threadIdx * sizeof(INPUT_DATA_TYPE)); - inputOffset2 = (inputOffset2 + 3) & ~3; // Align to 4 bytes + uint inputOffset2 = threadIdx * INPUT_VECTOR_2_STRIDE; vector input2 = InputVector2.Load >(inputOffset2); RWMatrixRef mat = { AccumMatrix, 0, STRIDE }; @@ -12954,6 +12956,10 @@ void main(uint threadIdx : SV_GroupThreadID) CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str()); auto MatrixDataTypeEnumDefine = CreateDefineFromString( L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str()); + auto InputVector1StrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride()); + auto InputVector2StrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride()); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -12967,6 +12973,8 @@ void main(uint 
threadIdx : SV_GroupThreadID) InputInterpretationEnumDefine.c_str(), HlslMatrixLayoutDefine.c_str(), MatrixDataTypeEnumDefine.c_str(), + InputVector1StrideDefine.c_str(), + InputVector2StrideDefine.c_str(), }; CComPtr IncludeHandler = @@ -12991,142 +12999,34 @@ void main(uint threadIdx : SV_GroupThreadID) 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, IID_PPV_ARGS(&CommandList))); - // Setup input matrix as all-ones in sint8/fp32 format. This will later be - // converted to the appropriate data type by the matrix conversion API. CComPtr InputMatrixSRVResource, InputMatrixSRVUploadResource; - std::vector InputMatrix; - if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, - Config.DimM); - } else if (AccumulateProps.AccumulationType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - AccumulateProps.AccumulationType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - AccumulateProps.AccumulationType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // Matrix source data is fp32, which gets converted to fp16 during matrix - // conversion - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, - Config.DimM); - } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); - return; - } - CreateTestResources(D3DDevice, CommandList, InputMatrix.data(), InputMatrix.size(), CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()), &InputMatrixSRVResource, &InputMatrixSRVUploadResource); - // Create input vectors CComPtr InputVecSRVResource1, InputVecSRVUploadResource1; - std::vector InputVector1; CComPtr InputVecSRVResource2, InputVecSRVUploadResource2; - std::vector InputVector2; - - if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputVector1 = 
CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimM); - InputVector2 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimN); - } else if (AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - InputVector1 = - CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.DimM); - InputVector2 = - CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.DimN); - } else if (AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputVector1 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimM); - InputVector2 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimN); - } else { - WEX::Logging::Log::Error(L"Unsupported input data type"); - return; - } - if (InputVector1.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputVector1.resize(InputVector1.size() + 4 - (InputVector1.size() % 4)); - } - if (InputVector2.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputVector2.resize(InputVector2.size() + 4 - (InputVector2.size() % 4)); - } - CreateTestResources(D3DDevice, CommandList, InputVector1.data(), - InputVector1.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputVector1.size()), - &InputVecSRVResource1, &InputVecSRVUploadResource1); - CreateTestResources(D3DDevice, CommandList, InputVector2.data(), - InputVector2.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputVector2.size()), - &InputVecSRVResource2, &InputVecSRVUploadResource2); + + CreateTestResources( + D3DDevice, CommandList, InputVector1.getBuffer(), + InputVector1.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputVector1.getTotalBytes()), + &InputVecSRVResource1, &InputVecSRVUploadResource1); + CreateTestResources( + D3DDevice, CommandList, InputVector2.getBuffer(), + InputVector2.getTotalBytes(), + 
CD3DX12_RESOURCE_DESC::Buffer(InputVector2.getTotalBytes()), + &InputVecSRVResource2, &InputVecSRVUploadResource2); // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector1.size() / sizeof(int32_t)), + (UINT)(InputVector1.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource1); CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector2.size() / sizeof(int32_t)), + (UINT)(InputVector2.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource2); - // Calculate reference output - auto ExpectedOutputBufferI8 = - CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, Config.DimM); - std::vector ExpectedOutputBuffer(ExpectedOutputBufferI8.size() / - sizeof(float)); - std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(), - ExpectedOutputBufferI8.size()); - - if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { - std::vector InputVector1FP16( - InputVector1.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputVector1FP16.data(), InputVector1.data(), - InputVector1.size()); - - std::vector InputVector2FP16( - InputVector2.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputVector2FP16.data(), InputVector2.data(), - InputVector2.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int M = 0; M < Config.DimM; ++M) { - for (int N = 0; N < Config.DimN; ++N) { - float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) * - ConvertFloat16ToFloat32(InputVector2FP16[N]); - ExpectedOutputBuffer[M * Config.DimN + N] += acc; - } - } - } - } else if (AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - std::vector InputVector1FP32(InputVector1.size() / sizeof(float)); - std::memcpy(InputVector1FP32.data(), InputVector1.data(), - InputVector1.size()); - - std::vector InputVector2FP32(InputVector2.size() / sizeof(float)); - std::memcpy(InputVector2FP32.data(), InputVector2.data(), - InputVector2.size()); - - for (int ThreadIdx = 0; 
ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int M = 0; M < Config.DimM; ++M) { - for (int N = 0; N < Config.DimN; ++N) { - float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] * - InputVector2FP32[ThreadIdx * Config.DimN + N]; - ExpectedOutputBuffer[M * Config.DimN + N] += Acc; - } - } - } - } - CComPtr ConvertedMatrixResource, ConvertedMatrixReadResource; int ConvertedMatrixSize = 0; { From 721087a382acf72acb888a0a49320226c48f8a28 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 6 May 2025 16:17:37 -0400 Subject: [PATCH 2/6] Support odd matrix/vector sizes --- .../unittests/HLSLExec/ExecutionTest.cpp | 84 +++++++++++++------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index f47b4624d6..934210af1f 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12149,6 +12149,14 @@ void ExecutionTest::runCoopVecMulTestConfig( {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, {16, 16, 32, 1, 
D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, @@ -12157,6 +12165,14 @@ void ExecutionTest::runCoopVecMulTestConfig( {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, @@ -12165,6 +12181,14 @@ void ExecutionTest::runCoopVecMulTestConfig( {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, 
true}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, false}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, @@ -12181,6 +12205,22 @@ void ExecutionTest::runCoopVecMulTestConfig( false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, }; for (auto Config : TestConfigs) { @@ -12280,18 +12320,15 @@ void ExecutionTest::runCoopVecMulSubtest( // FIXME: This does not capture all cases, but is sufficient for the preview // feature set if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer(); - float *InputVectorF32 = (float *)InputVector.getBuffer(); - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + int32_t *InputBiasI32 = InputBias.getVector(0); for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { int Acc = 0; for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { int InputElem; if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int) - InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx]; + InputElem = (int)InputVector.getVector(ThreadIdx)[InputIdx]; } else { InputElem = InputVector.getVector(ThreadIdx)[InputIdx]; } @@ -12315,22 +12352,21 @@ void 
ExecutionTest::runCoopVecMulSubtest( D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - DirectX::PackedVector::HALF *InputVectorFP16 = - (DirectX::PackedVector::HALF *)InputVector.getBuffer(); - DirectX::PackedVector::HALF *InputBiasFP16 = - (DirectX::PackedVector::HALF *)InputBias.getBuffer(); - // The CPU reference matrix is float std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + DirectX::PackedVector::HALF *InputVectorFP16 = + InputVector.getVector(ThreadIdx); + DirectX::PackedVector::HALF *InputBiasFP16 = + InputBias.getVector(0); for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { float Acc = 0; for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - float const InputElem = ConvertFloat16ToFloat32( - InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]); + float const InputElem = + ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]); float const MatrixElem = InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; Acc += InputElem * MatrixElem; @@ -12365,7 +12401,7 @@ void main(uint threadIdx : SV_GroupThreadID) using namespace dx::linalg; uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); - vector input = InputVector.Load >(inputOffset); + vector input = InputVector.Load >(inputOffset); MatrixRef mat = { InputMatrix, 0, STRIDE }; @@ -12439,8 +12475,9 @@ void main(uint threadIdx : SV_GroupThreadID) auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); auto InputDataTypeDefine = CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType); - auto InputDivisorDefine = - CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor); + auto InputDivisorDefine = CreateDefineFromInt( + L"INPUT_VECTOR_NUM_ELEMENTS", + (Config.InputPerThread + InputDivisor - 1) / InputDivisor); auto 
AccumDataTypeDefine = CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType); auto InputInterpretationEnumDefine = CreateDefineFromString( @@ -12596,11 +12633,12 @@ void main(uint threadIdx : SV_GroupThreadID) &ConvertInfo.DestInfo); } + int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16; + // Create resource to hold matrix copy - CreateTestResources( - D3DDevice, CommandList, nullptr, 0, - CD3DX12_RESOURCE_DESC::Buffer(ConvertInfo.DestInfo.DestSize), - &ConvertedMatrixResource, nullptr); + CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize, + CD3DX12_RESOURCE_DESC::Buffer(SRVSize), + &ConvertedMatrixResource, nullptr); // Set up data descriptors ConvertInfo.DataDesc.DestVA = @@ -12613,13 +12651,7 @@ void main(uint threadIdx : SV_GroupThreadID) __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); - // This increments baseHandle - if ((ConvertInfo.DestInfo.DestSize % 4) != 0) { - WEX::Logging::Log::Error(L"DestSize is not aligned to 4 bytes"); - return; - } - CreateRawSRV(D3DDevice, BaseHandle, - ConvertInfo.DestInfo.DestSize / sizeof(int32_t), + CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t), ConvertedMatrixResource); } From 5dde799dc0d0cff77468aa877e5446b2414dac36 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 13:10:59 -0400 Subject: [PATCH 3/6] Finish support for NumLayers=2 --- tools/clang/unittests/HLSLExec/CoopVec.h | 275 ++++++++++++ .../unittests/HLSLExec/ExecutionTest.cpp | 410 +++++++++--------- 2 files changed, 492 insertions(+), 193 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index cd24a556bd..b5c0a2f355 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -448,6 +448,74 @@ struct TestVector { uint8_t *getBuffer() { return Buffer; } const uint8_t *getBuffer() const { return Buffer; } + // Copy assignment 
operator + TestVector &operator=(const TestVector &other) { + if (this != &other) { + // Free existing buffer + if (Buffer) { +#ifdef _MSC_VER + _aligned_free(Buffer); +#else + std::free(Buffer); +#endif + Buffer = nullptr; + } + + // Copy metadata + NumVectors = other.NumVectors; + VectorSize = other.VectorSize; + ElementSize = other.ElementSize; + Stride = other.Stride; + TotalBytes = other.TotalBytes; + + // Allocate new buffer + void *Ptr = nullptr; +#ifdef _MSC_VER + Ptr = _aligned_malloc(TotalBytes, 16); +#else + Ptr = std::aligned_alloc(16, TotalBytes); +#endif + Buffer = reinterpret_cast(Ptr); + + // Copy data + if (other.Buffer) { + std::memcpy(Buffer, other.Buffer, TotalBytes); + } + } + return *this; + } + + // Move assignment operator + TestVector &operator=(TestVector &&other) noexcept { + if (this != &other) { + // Free existing buffer + if (Buffer) { +#ifdef _MSC_VER + _aligned_free(Buffer); +#else + std::free(Buffer); +#endif + } + + // Move metadata and buffer + NumVectors = other.NumVectors; + VectorSize = other.VectorSize; + ElementSize = other.ElementSize; + Stride = other.Stride; + TotalBytes = other.TotalBytes; + Buffer = other.Buffer; + + // Reset the source object + other.NumVectors = 0; + other.VectorSize = 0; + other.ElementSize = 0; + other.Stride = 0; + other.TotalBytes = 0; + other.Buffer = nullptr; + } + return *this; + } + template T *getVector(size_t I) { uint8_t *Ptr = Buffer + I * Stride; return reinterpret_cast(Ptr); @@ -481,6 +549,20 @@ struct TestVector { } } + template void fillAllOnesTestData() { + // Create a vector of (1, 1, 1, ...) 
+ for (size_t I = 0; I < NumVectors; ++I) { + T *Vec = getVector(I); + for (size_t J = 0; J < VectorSize; ++J) + if constexpr (std::is_same_v) { + // Special case for HALF, which requires conversion from float + Vec[J] = static_cast(ConvertFloat32ToFloat16(1.0f)); + } else { + Vec[J] = static_cast(1); + } + } + } + static TestVector createSimpleTestVector(size_t NumVectors, size_t VectorSize, D3D12_LINEAR_ALGEBRA_DATATYPE DataType, @@ -553,6 +635,199 @@ struct TestVector { } return Vec; } + + static TestVector + createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize, + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + size_t ElementSize; + switch (DataInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + ElementSize = sizeof(int8_t); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + ElementSize = sizeof(float); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + TestVector Vec(NumVectors, VectorSize, ElementSize); + switch (DataInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + Vec.fillAllOnesTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + Vec.fillAllOnesTestData(); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + return 
Vec; + } + + D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO + getConversionInfo(ID3D12Device *D3DDevice, + D3D12_LINEAR_ALGEBRA_DATATYPE DestDataType, + D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout) { + // Create source matrix info + D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {}; + ConvertInfo.SrcInfo.SrcDataType = + ::CoopVecHelpers::GetMatrixSrcDataType(DestDataType); + ConvertInfo.SrcInfo.SrcLayout = + D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; + + // Create destination matrix info + ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver + int DestEltSize = 0; + switch (DestDataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: + ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8; + DestEltSize = 1; + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16; + DestEltSize = 2; // FP16 + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + ConvertInfo.DestInfo.DestDataType = + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3; + DestEltSize = 1; // FP8 + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + ConvertInfo.DestInfo.DestDataType = + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2; + DestEltSize = 1; // FP8 + break; + } + ConvertInfo.SrcInfo.SrcStride = (UINT)getStride(); + ConvertInfo.SrcInfo.SrcSize = (UINT)getTotalBytes(); + + ConvertInfo.DestInfo.DestLayout = MatrixLayout; + ConvertInfo.DestInfo.DestStride = 0; + ConvertInfo.DestInfo.NumRows = (UINT)getNumVectors(); + ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize(); + + if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { + ConvertInfo.DestInfo.DestStride = (UINT)getVectorSize() * DestEltSize; + } else if (MatrixLayout == + D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { + ConvertInfo.DestInfo.DestStride = (UINT)getNumVectors() * DestEltSize; + } + + // Get destination size using preview interface + { + 
CComPtr PreviewDevice; + VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview), + (void **)&PreviewDevice)); + + // Query required destination size + PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo( + &ConvertInfo.DestInfo); + } + + return ConvertInfo; + } + + static TestVector + matrixVectorMultiply(const TestVector &Matrix, const TestVector &InputVector, + const TestVector &Bias, bool HasBias, + D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, + D3D12_LINEAR_ALGEBRA_DATATYPE InputType) { + bool IsFP32 = false; + switch (MatrixInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + IsFP32 = true; + break; + default: + break; + } + + TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(), + sizeof(float)); + + if (IsFP32) { + for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { + const DirectX::PackedVector::HALF *InputBiasFP16 = + Bias.getVector(0); + for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); + ++OutputIdx) { + float Acc = 0; + + for (int InputIdx = 0; InputIdx < Matrix.getVectorSize(); + ++InputIdx) { + float InputElem; + if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + InputElem = InputVector.getVector(VecIdx)[InputIdx]; + } else { + InputElem = ConvertFloat16ToFloat32( + InputVector.getVector( + VecIdx)[InputIdx]); + } + float const MatrixElem = + Matrix.getVector(OutputIdx)[InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (HasBias) { + Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); + } + + float Result = Acc; + ResultVec.getVector(VecIdx)[OutputIdx] = Result; + } + } + } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { + for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { + const int32_t *InputBiasI32 = Bias.getVector(0); + for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); + 
++OutputIdx) { + int Acc = 0; + + for (int InputIdx = 0; InputIdx < Matrix.getVectorSize(); + ++InputIdx) { + int InputElem; + if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + InputElem = (int)InputVector.getVector(VecIdx)[InputIdx]; + } else { + InputElem = InputVector.getVector(VecIdx)[InputIdx]; + } + int const MatrixElem = + Matrix.getVector(OutputIdx)[InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (HasBias) { + Acc += InputBiasI32[OutputIdx]; + } + + float Result = float(Acc); + ResultVec.getVector(VecIdx)[OutputIdx] = Result; + } + } + } else { + throw std::invalid_argument("Unsupported matrix interpretation"); + } + + return ResultVec; + } }; }; // namespace CoopVecHelpers diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 934210af1f..a613f28139 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -789,7 +789,7 @@ class ExecutionTest { int InputPerThread; int OutputPerThread; int NumThreads; - int NumLevels; + int NumLayers; D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout; bool Bias; }; @@ -12221,6 +12221,88 @@ void ExecutionTest::runCoopVecMulTestConfig( false}, {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true}, + + // NumLayers=2 tests + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 16, 
2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {32, 8, 16, 2, 
D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 16, 2, 
D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, }; for (auto Config : TestConfigs) { @@ -12234,6 +12316,21 @@ void ExecutionTest::runCoopVecMulTestConfig( continue; } + if (Config.NumLayers > 1 && + (MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 || + MulProps.InputInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + MulProps.InputInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) && + (MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 || + MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32)) { + // We do not support multi-layer tests with packed types as input with + // full-precision integer bias Supporting this in the current framework + // would require repacking the accumulator vectors + continue; + } + bool IsInFilter = CoopVecHelpers::IsMatrixLayoutInFilter( L"CoopVecMatrixLayout", Config.MatrixLayout); if (!IsInFilter) { @@ -12250,9 +12347,9 @@ void ExecutionTest::runCoopVecMulSubtest( LogCommentFmt( L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: " - L"%d, NumLevels: %d, Bias: %s, MatrixLayout: %s", + L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s", Config.InputPerThread, Config.OutputPerThread, Config.NumThreads, - Config.NumLevels, Config.Bias ? L"true" : L"false", + Config.NumLayers, Config.Bias ? 
L"true" : L"false", CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str()); const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4); @@ -12261,8 +12358,8 @@ void ExecutionTest::runCoopVecMulSubtest( CComPtr RootSignature; { CD3DX12_DESCRIPTOR_RANGE Ranges[2]; - Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 3, 0, - 0); // InputVector, InputMatrix, InputBias + Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0, + 0); // InputVector, InputBias, InputMatrices[] Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer CreateRootSignatureFromRanges(D3DDevice, &RootSignature, Ranges, 2, nullptr, 0); @@ -12273,7 +12370,7 @@ void ExecutionTest::runCoopVecMulSubtest( { D3D12_DESCRIPTOR_HEAP_DESC Desc = {}; Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - Desc.NumDescriptors = 4; + Desc.NumDescriptors = 3 + Config.NumLayers; Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; VERIFY_SUCCEEDED( D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap))); @@ -12281,106 +12378,35 @@ void ExecutionTest::runCoopVecMulSubtest( CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle( DescriptorHeap->GetCPUDescriptorHandleForHeapStart()); - // Setup input data - auto ExpectedOutputBuffer = - std::make_unique(Config.OutputPerThread * Config.NumThreads); - - std::vector InputMatrix; - if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - 
MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // Matrix source data is fp32, which gets converted to fp16 during matrix - // conversion - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); - return; - } + // Our input matrix is really a set of row vectors, which we can represent + // as a TestVector. + std::vector<::CoopVecHelpers::TestVector> InputMatrices; + for (int I = 0; I < Config.NumLayers - 1; ++I) { + // Each layer except the last is InputPerThread x InputPerThread + InputMatrices.push_back( + ::CoopVecHelpers::TestVector::createAllOnesTestMatrix( + Config.InputPerThread, Config.InputPerThread, + MulProps.MatrixInterpretation)); + } + // Last layer, matrix size is OutputPerThread x InputPerThread + InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix( + Config.OutputPerThread, Config.InputPerThread, + MulProps.MatrixInterpretation)); auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector( Config.NumThreads, Config.InputPerThread, MulProps.InputType, MulProps.InputInterpretation); auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector( - 1, Config.OutputPerThread, MulProps.BiasInterpretation, - MulProps.BiasInterpretation); + 1, std::max(Config.OutputPerThread, Config.InputPerThread), + MulProps.BiasInterpretation, MulProps.BiasInterpretation); // Calculate reference output - // FIXME: This does not capture all cases, but is sufficient for the preview - // feature set - if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - int32_t *InputBiasI32 = InputBias.getVector(0); - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - int Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - int InputElem; - 
if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int)InputVector.getVector(ThreadIdx)[InputIdx]; - } else { - InputElem = InputVector.getVector(ThreadIdx)[InputIdx]; - } - int const MatrixElem = - InputMatrix[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += InputBiasI32[OutputIdx]; - } - - float Result = float(Acc); - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // The CPU reference matrix is float - std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); - std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - DirectX::PackedVector::HALF *InputVectorFP16 = - InputVector.getVector(ThreadIdx); - DirectX::PackedVector::HALF *InputBiasFP16 = - InputBias.getVector(0); - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - float Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - float const InputElem = - ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]); - float const MatrixElem = - InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); - } - - float Result = Acc; - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } + auto ExpectedOutput = InputVector; + for (int I = 0; I < Config.NumLayers; ++I) { + ExpectedOutput = ::CoopVecHelpers::TestVector::matrixVectorMultiply( + InputMatrices[I], ExpectedOutput, InputBias, Config.Bias, + 
MulProps.MatrixInterpretation, + I == 0 ? MulProps.InputType : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32); } // Create the compute pipeline state for the CoopVec shader @@ -12391,7 +12417,7 @@ void ExecutionTest::runCoopVecMulSubtest( ByteAddressBuffer InputVector : register(t0); ByteAddressBuffer InputBias : register(t1); -ByteAddressBuffer InputMatrix : register(t2); +ByteAddressBuffer InputMatrix[NUM_LAYERS] : register(t2); RWByteAddressBuffer OutputBuffer: register(u0); [shader("compute")] @@ -12402,25 +12428,57 @@ void main(uint threadIdx : SV_GroupThreadID) uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); vector input = InputVector.Load >(inputOffset); + VectorRef biasVec = { InputBias, 0 }; + + vector output; +)"; + + if (Config.NumLayers == 1) { + ShaderSource += R"( + MatrixRef mat = { InputMatrix[0], 0, STRIDE }; + + if (USE_BIAS) { + output = MulAdd(mat, MakeInterpretedVector(input), biasVec); + } else { + output = Mul(mat, MakeInterpretedVector(input)); + } +)"; + } else if (Config.NumLayers == 2) { + ShaderSource += R"( + vector accum; - MatrixRef mat = { InputMatrix, 0, STRIDE }; + MatrixRef mat0 = { InputMatrix[0], 0, STRIDE }; + if (USE_BIAS) { + accum = MulAdd(mat0, MakeInterpretedVector(input), biasVec); + //accum = Mul(mat0, MakeInterpretedVector(input)); + } else { + accum = Mul(mat0, MakeInterpretedVector(input)); + } - vector accum; + // Dummy activation function; all of our intermediates are positive (currently). 
+ accum = max(accum, 0); + MatrixRef mat1 = { InputMatrix[1], 0, STRIDE }; if (USE_BIAS) { - VectorRef biasVec = { InputBias, 0 }; - accum = MulAdd(mat, MakeInterpretedVector(input), biasVec); + output = MulAdd(mat1, MakeInterpretedVector(accum), biasVec); } else { - accum = Mul(mat, MakeInterpretedVector(input)); + output = Mul(mat1, MakeInterpretedVector(accum)); } +)"; + } - vector result = (vector)accum; + ShaderSource += R"( + vector result = (vector)output; // Ensure 4-byte alignment for vector store uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float); OutputBuffer.Store >(outputOffset, result); } - )"; +)"; + +#if 0 + printf("%s\n", ShaderSource.c_str()); +#endif auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { std::wstringstream Stream; @@ -12462,7 +12520,7 @@ void main(uint threadIdx : SV_GroupThreadID) const std::wstring InputInterpretationEnum = CoopVecHelpers::GetHlslInterpretationForDataType( MulProps.InputInterpretation); - const std::wstring AccumInterpretationEnum = + const std::wstring BiasInterpretationEnum = CoopVecHelpers::GetHlslInterpretationForDataType( MulProps.BiasInterpretation); @@ -12487,10 +12545,15 @@ void main(uint threadIdx : SV_GroupThreadID) auto MatrixDataTypeEnumDefine = CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum); auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); + // Treat the accumulator interpretation the same as the input interpretation + // for the purposes of MakeInterpretedVector. 
auto AccumInterpretationEnumDefine = CreateDefineFromString( - L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum); + L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum); auto InputVectorStrideDefine = CreateDefineFromInt( L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); + auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers); + auto BiasInterpretationEnumDefine = CreateDefineFromString( + L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -12507,8 +12570,18 @@ void main(uint threadIdx : SV_GroupThreadID) UseBiasDefine.c_str(), AccumInterpretationEnumDefine.c_str(), InputVectorStrideDefine.c_str(), + NumLayersDefine.c_str(), + BiasInterpretationEnumDefine.c_str(), }; +#if 0 + // Print options for debugging + WEX::Logging::Log::Comment(L"Shader compilation options:"); + for (UINT i = 0; i < _countof(Options); i++) { + WEX::Logging::Log::Comment(Options[i]); + } +#endif + CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); @@ -12531,14 +12604,17 @@ void main(uint threadIdx : SV_GroupThreadID) 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, IID_PPV_ARGS(&CommandList))); - // Setup input matrix as all-ones in sint8 format. This will later be - // converted to the appropriate data type by the matrix conversion API. 
- CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource; - - CreateTestResources(D3DDevice, CommandList, InputMatrix.data(), - InputMatrix.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()), - &InputMatrixSRVResource, &InputMatrixSRVUploadResource); + std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVResources( + Config.NumLayers); + std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVUploadResources( + Config.NumLayers); + for (int I = 0; I < Config.NumLayers; ++I) { + CreateTestResources( + D3DDevice, CommandList, InputMatrices[I].getBuffer(), + InputMatrices[I].getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputMatrices[I].getTotalBytes()), + &InputMatrixSRVResources[I], &InputMatrixSRVUploadResources[I]); + } // Create input vector of an appropriate type. All integer types start as // SINT8 for now. @@ -12568,82 +12644,25 @@ void main(uint threadIdx : SV_GroupThreadID) (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)), InputBiasSRVResource); - CComPtr<ID3D12Resource> ConvertedMatrixResource; - { - // Create source matrix info - D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {}; - ConvertInfo.SrcInfo.SrcDataType = - CoopVecHelpers::GetMatrixSrcDataType(MulProps.MatrixInterpretation); - ConvertInfo.SrcInfo.SrcLayout = - D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; - - // Create destination matrix info - ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver - int SrcEltSize = 0; - int DestEltSize = 0; - switch (MulProps.MatrixInterpretation) { - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: - ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8; - SrcEltSize = 1; - DestEltSize = 1; - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: - ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16; - SrcEltSize = 4; // FP32 - DestEltSize = 2; // FP16 - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: - ConvertInfo.DestInfo.DestDataType = - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3; - SrcEltSize = 
4; // FP32 - DestEltSize = 1; // FP8 - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: - ConvertInfo.DestInfo.DestDataType = - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2; - SrcEltSize = 4; // FP32 - DestEltSize = 1; // FP8 - break; - } - ConvertInfo.SrcInfo.SrcStride = Config.InputPerThread * SrcEltSize; - ConvertInfo.SrcInfo.SrcSize = - Config.InputPerThread * Config.OutputPerThread * SrcEltSize; - - ConvertInfo.DestInfo.DestLayout = Config.MatrixLayout; - ConvertInfo.DestInfo.DestStride = 0; - ConvertInfo.DestInfo.NumRows = Config.OutputPerThread; - ConvertInfo.DestInfo.NumColumns = Config.InputPerThread; - - if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { - ConvertInfo.DestInfo.DestStride = Config.InputPerThread * DestEltSize; - } else if (Config.MatrixLayout == - D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { - ConvertInfo.DestInfo.DestStride = Config.OutputPerThread * DestEltSize; - } - - // Get destination size using preview interface - { - CComPtr<ID3D12DevicePreview> PreviewDevice; - VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview), - (void **)&PreviewDevice)); - - // Query required destination size - PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo( - &ConvertInfo.DestInfo); - } + // Create converted matrix resource and SRV for each input matrix + std::vector<CComPtr<ID3D12Resource>> ConvertedMatrixResources( + Config.NumLayers); + for (int I = 0; I < Config.NumLayers; ++I) { + auto ConvertInfo = InputMatrices[I].getConversionInfo( + D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout); int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16; // Create resource to hold matrix copy CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize, CD3DX12_RESOURCE_DESC::Buffer(SRVSize), - &ConvertedMatrixResource, nullptr); + &ConvertedMatrixResources[I], nullptr); // Set up data descriptors ConvertInfo.DataDesc.DestVA = - ConvertedMatrixResource->GetGPUVirtualAddress(); - ConvertInfo.DataDesc.SrcVA = 
InputMatrixSRVResource->GetGPUVirtualAddress(); + ConvertedMatrixResources[I]->GetGPUVirtualAddress(); + ConvertInfo.DataDesc.SrcVA = + InputMatrixSRVResources[I]->GetGPUVirtualAddress(); // Get command list interface and perform conversion CComPtr<ID3D12GraphicsCommandList11> CommandList11; @@ -12651,8 +12670,9 @@ void main(uint threadIdx : SV_GroupThreadID) __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); + // This increments BaseHandle CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t), - ConvertedMatrixResource); + ConvertedMatrixResources[I]); } CComPtr UavResource; @@ -12697,14 +12717,18 @@ void main(uint threadIdx : SV_GroupThreadID) float *ResultBuffer = (float *)MappedData.data(); bool Equal = true; - for (int i = 0; i < OutputBufferSize / sizeof(float); i++) { - if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) || - fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) { - LogErrorFmt(L"Result mismatch at index %d", i); - LogErrorFmt(L"ResultBuffer[%d]: %f, ExpectedOutputBuffer[%d]: %f", i, - ResultBuffer[i], i, ExpectedOutputBuffer[i]); - Equal = false; - break; + + for (int i = 0; i < Config.NumThreads; ++i) { + for (int j = 0; j < Config.OutputPerThread; ++j) { + float Result = ResultBuffer[i * Config.OutputPerThread + j]; + float Expected = ExpectedOutput.getVector<float>(i)[j]; + if (isnan(Result) || isnan(Expected) || + fabs(Result - Expected) > 0.00001) { + LogErrorFmt(L"Result mismatch at index %d", + i * Config.OutputPerThread + j); + LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected); + Equal = false; + } } } VERIFY_IS_TRUE(Equal); From a67edd490b2844ba259a74042305b564c7f4a870 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 13:15:01 -0400 Subject: [PATCH 4/6] Remove dead code --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp 
b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index a613f28139..3d69815034 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12476,10 +12476,6 @@ void main(uint threadIdx : SV_GroupThreadID) } )"; -#if 0 - printf("%s\n", ShaderSource.c_str()); -#endif - auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { std::wstringstream Stream; Stream << L"-D" << Name << L"=" << Value; @@ -12574,14 +12570,6 @@ void main(uint threadIdx : SV_GroupThreadID) BiasInterpretationEnumDefine.c_str(), }; -#if 0 - // Print options for debugging - WEX::Logging::Log::Comment(L"Shader compilation options:"); - for (UINT i = 0; i < _countof(Options); i++) { - WEX::Logging::Log::Comment(Options[i]); - } -#endif - CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); From b2d35a973156b4b7a1948c520f4b6a0e2c15b6b5 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Thu, 8 May 2025 07:25:28 -0400 Subject: [PATCH 5/6] Remove dead line --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 3d69815034..ef769b12f7 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12450,7 +12450,6 @@ void main(uint threadIdx : SV_GroupThreadID) MatrixRef mat0 = { InputMatrix[0], 0, STRIDE }; if (USE_BIAS) { accum = MulAdd(mat0, MakeInterpretedVector(input), biasVec); - //accum = Mul(mat0, MakeInterpretedVector(input)); } else { accum = Mul(mat0, MakeInterpretedVector(input)); } From 11f3b6de20e9f574747d313b2c1c94ba4057b7e6 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Thu, 8 May 2025 07:27:07 -0400 Subject: [PATCH 6/6] Add comment about ambiguous IsFP32 flag --- tools/clang/unittests/HLSLExec/CoopVec.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index b5c0a2f355..18b8669197 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -749,12 +749,13 @@ struct TestVector { const TestVector &Bias, bool HasBias, D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, D3D12_LINEAR_ALGEBRA_DATATYPE InputType) { - bool IsFP32 = false; + // The CPU reference matrix is FP32 for all FP interpretations. + bool IsMatrixFP32 = false; switch (MatrixInterpretation) { case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: - IsFP32 = true; + IsMatrixFP32 = true; break; default: break; @@ -763,7 +764,7 @@ struct TestVector { TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(), sizeof(float)); - if (IsFP32) { + if (IsMatrixFP32) { for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { const DirectX::PackedVector::HALF *InputBiasFP16 = Bias.getVector<DirectX::PackedVector::HALF>(0);