From d59ea37784bb386615abef7fce80de2272b25a27 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 6 May 2025 14:40:37 -0400 Subject: [PATCH 01/26] Clean up vector handling code by introducing TestVector --- tools/clang/unittests/HLSLExec/CoopVec.h | 200 +++++++ .../unittests/HLSLExec/ExecutionTest.cpp | 532 +++++++----------- 2 files changed, 416 insertions(+), 316 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index f166c61f67..cd24a556bd 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -4,6 +4,8 @@ #include #include + +#include #include #include "dxc/Support/microcom.h" @@ -61,6 +63,7 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { }; namespace CoopVecHelpers { + template static std::vector CreateAllOnesInputMatrix(uint32_t Width, uint32_t Height) { @@ -354,6 +357,203 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) { return D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32; } } + +struct TestVector { +private: + size_t NumVectors = 0; + size_t VectorSize = 0; + size_t ElementSize = 0; + size_t Stride = 0; + size_t TotalBytes = 0; + uint8_t *Buffer = nullptr; + +public: + TestVector(size_t NumVectors, size_t VectorSize, size_t ElementSize, + size_t Alignment = 16) + : NumVectors(NumVectors), VectorSize(VectorSize), + ElementSize(ElementSize) { + if (NumVectors == 0) { + throw std::invalid_argument("NumVectors must be greater than 0"); + } + if (VectorSize == 0) { + throw std::invalid_argument("VectorSize must be greater than 0"); + } + if (ElementSize == 0) { + throw std::invalid_argument("ElementSize must be greater than 0"); + } + + size_t VectorBytes = VectorSize * ElementSize; + Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment; + TotalBytes = Stride * NumVectors; + + void *Ptr = nullptr; +#ifdef _MSC_VER + Ptr = _aligned_malloc(TotalBytes, Alignment); +#else + Ptr = std::aligned_alloc(Alignment, TotalBytes); +#endif + Buffer = reinterpret_cast(Ptr); + std::fill(Buffer, Buffer + TotalBytes, (uint8_t)0xFF); + } + + // Copy constructor + TestVector(const TestVector &other) + : NumVectors(other.NumVectors), VectorSize(other.VectorSize), + ElementSize(other.ElementSize), Stride(other.Stride), + TotalBytes(other.TotalBytes) { + + void *Ptr = nullptr; +#ifdef _MSC_VER + Ptr = _aligned_malloc(TotalBytes, 16); +#else + Ptr = std::aligned_alloc(16, TotalBytes); +#endif + Buffer = reinterpret_cast(Ptr); + + if (other.Buffer) { + std::memcpy(Buffer, other.Buffer, TotalBytes); + } + } + + // Move constructor + TestVector(TestVector &&other) noexcept + : NumVectors(other.NumVectors), VectorSize(other.VectorSize), + ElementSize(other.ElementSize), Stride(other.Stride), + TotalBytes(other.TotalBytes), Buffer(other.Buffer) { + + // Reset the source object + other.NumVectors = 0; + other.VectorSize = 0; + other.ElementSize = 0; + other.Stride = 0; + other.TotalBytes = 0; + other.Buffer = nullptr; + } + + ~TestVector() { + if (Buffer) { +#ifdef _MSC_VER + _aligned_free(Buffer); +#else + std::free(Buffer); +#endif + } + } + + size_t getNumVectors() const { return NumVectors; } + size_t getVectorSize() const { return VectorSize; } + size_t getElementSize() const { return ElementSize; } + size_t getStride() const { return Stride; } + size_t getTotalBytes() const { return TotalBytes; } + uint8_t *getBuffer() { return Buffer; } + const uint8_t *getBuffer() const { return Buffer; } + + template T *getVector(size_t I) { + uint8_t *Ptr = Buffer + I * Stride; + return reinterpret_cast(Ptr); + } + + template const T *getVector(size_t I) const { + const uint8_t *Ptr = Buffer + I * Stride; + return reinterpret_cast(Ptr); + } + + template void fill(const T &Value) { + for (size_t I = 0; I < NumVectors; ++I) { + T *Vec = getVector(I); + for (size_t J = 0; J < VectorSize; ++J) + Vec[J] = Value; + } + } + + template void fillSimpleTestData() { + // Create a vector of (1, 1, 0, ...) + for (size_t I = 0; I < NumVectors; ++I) { + T *Vec = getVector(I); + for (size_t J = 0; J < VectorSize; ++J) + if constexpr (std::is_same_v) { + // Special case for HALF, which requires conversion from float + Vec[J] = static_cast( + ConvertFloat32ToFloat16((J == 0 || J == 1) ? 1.0f : 0.0f)); + } else { + Vec[J] = static_cast((J == 0 || J == 1) ? 1 : 0); + } + } + } + + static TestVector + createSimpleTestVector(size_t NumVectors, size_t VectorSize, + D3D12_LINEAR_ALGEBRA_DATATYPE DataType, + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + size_t ElementSize; + switch (DataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + ElementSize = sizeof(int8_t); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + ElementSize = sizeof(int16_t); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { + ElementSize = sizeof(int8_t); + } else { + ElementSize = sizeof(int32_t); + } + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + ElementSize = sizeof(DirectX::PackedVector::HALF); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + ElementSize = sizeof(float); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + TestVector Vec(NumVectors, VectorSize, ElementSize); + switch (DataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { + Vec.fillSimpleTestData(); + } else { + Vec.fillSimpleTestData(); + } + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + Vec.fillSimpleTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + Vec.fillSimpleTestData(); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + return Vec; + } +}; }; // namespace CoopVecHelpers #endif // HAVE_COOPVEC_API diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 55d569dd8d..f47b4624d6 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12241,6 +12241,112 @@ void ExecutionTest::runCoopVecMulSubtest( CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle( DescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + // Setup input data + auto ExpectedOutputBuffer = + std::make_unique(Config.OutputPerThread * Config.NumThreads); + + std::vector InputMatrix; + if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || + MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( + Config.InputPerThread, Config.OutputPerThread); + } else if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + // Matrix source data is fp32, which gets converted to fp16 during matrix + // conversion + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( + Config.InputPerThread, Config.OutputPerThread); + } else { + WEX::Logging::Log::Error(L"Unsupported matrix data type"); + return; + } + + auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector( + Config.NumThreads, Config.InputPerThread, MulProps.InputType, + MulProps.InputInterpretation); + auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector( + 1, Config.OutputPerThread, MulProps.BiasInterpretation, + MulProps.BiasInterpretation); + + // Calculate reference output + // FIXME: This does not capture all cases, but is sufficient for the preview + // feature set + if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { + int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer(); + float *InputVectorF32 = (float *)InputVector.getBuffer(); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { + int Acc = 0; + + for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { + int InputElem; + if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + InputElem = (int) + InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx]; + } else { + InputElem = InputVector.getVector(ThreadIdx)[InputIdx]; + } + int const MatrixElem = + InputMatrix[OutputIdx * Config.InputPerThread + InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (Config.Bias) { + Acc += InputBiasI32[OutputIdx]; + } + + float Result = float(Acc); + ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = + Result; + } + } + } else if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || + MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + DirectX::PackedVector::HALF *InputVectorFP16 = + (DirectX::PackedVector::HALF *)InputVector.getBuffer(); + DirectX::PackedVector::HALF *InputBiasFP16 = + (DirectX::PackedVector::HALF *)InputBias.getBuffer(); + + // The CPU reference matrix is float + std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); + std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { + float Acc = 0; + + for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { + float const InputElem = ConvertFloat16ToFloat32( + InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]); + float const MatrixElem = + InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (Config.Bias) { + Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); + } + + float Result = Acc; + ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = + Result; + } + } + } + // Create the compute pipeline state for the CoopVec shader CComPtr ComputePipelineState; { @@ -12258,9 +12364,7 @@ void main(uint threadIdx : SV_GroupThreadID) { using namespace dx::linalg; - // Ensure 4-byte alignment for vector loads - uint inputOffset = (INPUT_PER_THREAD * threadIdx * (sizeof(INPUT_DATA_TYPE) / INPUT_DIVISOR)); - inputOffset = (inputOffset + 3) & ~3; // Align to 4 bytes + uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); vector input = InputVector.Load >(inputOffset); MatrixRef mat = { InputMatrix, 0, STRIDE }; @@ -12278,7 +12382,6 @@ void main(uint threadIdx : SV_GroupThreadID) // Ensure 4-byte alignment for vector store uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float); - outputOffset = (outputOffset + 3) & ~3; // Align to 4 bytes OutputBuffer.Store >(outputOffset, result); } )"; @@ -12349,6 +12452,8 @@ void main(uint threadIdx : SV_GroupThreadID) auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); auto AccumInterpretationEnumDefine = CreateDefineFromString( L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum); + auto InputVectorStrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -12364,6 +12469,7 @@ void main(uint threadIdx : SV_GroupThreadID) MatrixDataTypeEnumDefine.c_str(), UseBiasDefine.c_str(), AccumInterpretationEnumDefine.c_str(), + InputVectorStrideDefine.c_str(), }; CComPtr IncludeHandler = @@ -12388,36 +12494,9 @@ void main(uint threadIdx : SV_GroupThreadID) 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, IID_PPV_ARGS(&CommandList))); - // Setup input data - auto ExpectedOutputBuffer = - std::make_unique(Config.OutputPerThread * Config.NumThreads); - // Setup input matrix as all-ones in sint8 format. This will later be // converted to the appropriate data type by the matrix conversion API. CComPtr InputMatrixSRVResource, InputMatrixSRVUploadResource; - std::vector InputMatrix; - if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // Matrix source data is fp32, which gets converted to fp16 during matrix - // conversion - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); - return; - } CreateTestResources(D3DDevice, CommandList, InputMatrix.data(), InputMatrix.size(), @@ -12427,180 +12506,31 @@ void main(uint threadIdx : SV_GroupThreadID) // Create input vector of an appropriate type. All integer types start as // SINT8 for now. CComPtr InputVecSRVResource, InputVecSRVUploadResource; - std::vector InputVector; - - if ((MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32 && - (MulProps.InputInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.InputInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED)) || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - InputVector = - CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) { - InputVector = CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.InputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported input data type"); - return; - } - if (InputVector.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputVector.resize(InputVector.size() + 4 - (InputVector.size() % 4)); - } - CreateTestResources(D3DDevice, CommandList, InputVector.data(), - InputVector.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputVector.size()), - &InputVecSRVResource, &InputVecSRVUploadResource); + + CreateTestResources( + D3DDevice, CommandList, InputVector.getBuffer(), + InputVector.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputVector.getTotalBytes()), + &InputVecSRVResource, &InputVecSRVUploadResource); // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector.size() / sizeof(int32_t)), + (UINT)(InputVector.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource); // Create input bias CComPtr InputBiasSRVResource, InputBiasSRVUploadResource; - std::vector InputBias; - if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || - MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputBias = CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) { - InputBias = - CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) { - InputBias = - CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { - InputBias = CoopVecHelpers::CreateInputBias( - Config.OutputPerThread); - } else if (MulProps.BiasInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputBias = CoopVecHelpers::CreateInputBias(Config.OutputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported bias data type"); - return; - } - - if (InputBias.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputBias.resize(InputBias.size() + 4 - (InputBias.size() % 4)); - } - CreateTestResources(D3DDevice, CommandList, InputBias.data(), - InputBias.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputBias.size()), + CreateTestResources(D3DDevice, CommandList, InputBias.getBuffer(), + InputBias.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputBias.getTotalBytes()), &InputBiasSRVResource, &InputBiasSRVUploadResource); // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputBias.size() / sizeof(int32_t)), + (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)), InputBiasSRVResource); - // Calculate reference output - // FIXME: This does not capture all cases, but is sufficient for the preview - // feature set - if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - // The input bias is really an array of int32_t - std::vector InputBiasI32(InputBias.size() / sizeof(int32_t)); - std::memcpy(InputBiasI32.data(), InputBias.data(), InputBias.size()); - - // The input vector is really an array of float if our vector input type is - // FLOAT32 - std::vector InputVectorF32(InputVector.size() / sizeof(int32_t)); - if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - std::memcpy(InputVectorF32.data(), InputVector.data(), - InputVector.size()); - } - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - int Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - int InputElem; - if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int) - InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx]; - } else { - InputElem = - InputVector[ThreadIdx * Config.InputPerThread + InputIdx]; - } - int const MatrixElem = - InputMatrix[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += InputBiasI32[OutputIdx]; - } - - float Result = float(Acc); - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // The input bias/vector is really an array of float16 - std::vector InputVectorFP16( - InputVector.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputVectorFP16.data(), InputVector.data(), InputVector.size()); - - std::vector InputBiasFP16( - InputBias.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputBiasFP16.data(), InputBias.data(), InputBias.size()); - - // The CPU reference matrix is float - std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); - std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - float Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - float const InputElem = ConvertFloat16ToFloat32( - InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]); - float const MatrixElem = - InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); - } - - float Result = Acc; - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } - } - CComPtr ConvertedMatrixResource; { // Create source matrix info @@ -12862,6 +12792,80 @@ void ExecutionTest::runCoopVecOuterProductSubtest( CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle( DescriptorHeap->GetCPUDescriptorHandleForHeapStart()); + // Setup input matrix as all-ones in sint8/fp32 format. This will later be + // converted to the appropriate data type by the matrix conversion API. + + std::vector InputMatrix; + if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, + Config.DimM); + } else if (AccumulateProps.AccumulationType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || + AccumulateProps.AccumulationType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || + AccumulateProps.AccumulationType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + // Matrix source data is fp32, which gets converted to fp16 during matrix + // conversion + InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, + Config.DimM); + } else { + WEX::Logging::Log::Error(L"Unsupported matrix data type"); + return; + } + + // Create input vectors + auto InputVector1 = CoopVecHelpers::TestVector::createSimpleTestVector( + Config.NumThreads, Config.DimM, AccumulateProps.InputType, + AccumulateProps.InputType); + auto InputVector2 = CoopVecHelpers::TestVector::createSimpleTestVector( + Config.NumThreads, Config.DimN, AccumulateProps.InputType, + AccumulateProps.InputType); + + // Calculate reference output + auto ExpectedOutputBufferI8 = + CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, Config.DimM); + std::vector ExpectedOutputBuffer(ExpectedOutputBufferI8.size() / + sizeof(float)); + std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(), + ExpectedOutputBufferI8.size()); + + if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { + DirectX::PackedVector::HALF *InputVector1FP16 = + reinterpret_cast( + InputVector1.getBuffer()); + DirectX::PackedVector::HALF *InputVector2FP16 = + reinterpret_cast( + InputVector2.getBuffer()); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int M = 0; M < Config.DimM; ++M) { + for (int N = 0; N < Config.DimN; ++N) { + float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) * + ConvertFloat16ToFloat32(InputVector2FP16[N]); + ExpectedOutputBuffer[M * Config.DimN + N] += acc; + } + } + } + } else if (AccumulateProps.InputType == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + float *InputVector1FP32 = + reinterpret_cast(InputVector1.getBuffer()); + float *InputVector2FP32 = + reinterpret_cast(InputVector2.getBuffer()); + + for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (int M = 0; M < Config.DimM; ++M) { + for (int N = 0; N < Config.DimN; ++N) { + float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] * + InputVector2FP32[ThreadIdx * Config.DimN + N]; + ExpectedOutputBuffer[M * Config.DimN + N] += Acc; + } + } + } + } + // Create a compute pipeline state object. CComPtr ComputePipelineState; { @@ -12880,12 +12884,10 @@ void main(uint threadIdx : SV_GroupThreadID) using namespace dx::linalg; // Ensure 4-byte alignment for vector loads - uint inputOffset1 = (DIM_M * threadIdx * sizeof(INPUT_DATA_TYPE)); - inputOffset1 = (inputOffset1 + 3) & ~3; // Align to 4 bytes + uint inputOffset1 = threadIdx * INPUT_VECTOR_1_STRIDE; vector input1 = InputVector1.Load >(inputOffset1); - uint inputOffset2 = (DIM_N * threadIdx * sizeof(INPUT_DATA_TYPE)); - inputOffset2 = (inputOffset2 + 3) & ~3; // Align to 4 bytes + uint inputOffset2 = threadIdx * INPUT_VECTOR_2_STRIDE; vector input2 = InputVector2.Load >(inputOffset2); RWMatrixRef mat = { AccumMatrix, 0, STRIDE }; @@ -12954,6 +12956,10 @@ void main(uint threadIdx : SV_GroupThreadID) CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str()); auto MatrixDataTypeEnumDefine = CreateDefineFromString( L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str()); + auto InputVector1StrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride()); + auto InputVector2StrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride()); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -12967,6 +12973,8 @@ void main(uint threadIdx : SV_GroupThreadID) InputInterpretationEnumDefine.c_str(), HlslMatrixLayoutDefine.c_str(), MatrixDataTypeEnumDefine.c_str(), + InputVector1StrideDefine.c_str(), + InputVector2StrideDefine.c_str(), }; CComPtr IncludeHandler = @@ -12991,142 +12999,34 @@ void main(uint threadIdx : SV_GroupThreadID) 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, IID_PPV_ARGS(&CommandList))); - // Setup input matrix as all-ones in sint8/fp32 format. This will later be - // converted to the appropriate data type by the matrix conversion API. CComPtr InputMatrixSRVResource, InputMatrixSRVUploadResource; - std::vector InputMatrix; - if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, - Config.DimM); - } else if (AccumulateProps.AccumulationType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - AccumulateProps.AccumulationType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - AccumulateProps.AccumulationType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // Matrix source data is fp32, which gets converted to fp16 during matrix - // conversion - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, - Config.DimM); - } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); - return; - } - CreateTestResources(D3DDevice, CommandList, InputMatrix.data(), InputMatrix.size(), CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()), &InputMatrixSRVResource, &InputMatrixSRVUploadResource); - // Create input vectors CComPtr InputVecSRVResource1, InputVecSRVUploadResource1; - std::vector InputVector1; CComPtr InputVecSRVResource2, InputVecSRVUploadResource2; - std::vector InputVector2; - - if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputVector1 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimM); - InputVector2 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimN); - } else if (AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - InputVector1 = - CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.DimM); - InputVector2 = - CoopVecHelpers::CreateInputVector( - Config.NumThreads, Config.DimN); - } else if (AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputVector1 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimM); - InputVector2 = CoopVecHelpers::CreateInputVector(Config.NumThreads, - Config.DimN); - } else { - WEX::Logging::Log::Error(L"Unsupported input data type"); - return; - } - if (InputVector1.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputVector1.resize(InputVector1.size() + 4 - (InputVector1.size() % 4)); - } - if (InputVector2.size() % 4 != 0) { - // Align size to 4 bytes for ByteAddressBuffer - InputVector2.resize(InputVector2.size() + 4 - (InputVector2.size() % 4)); - } - CreateTestResources(D3DDevice, CommandList, InputVector1.data(), - InputVector1.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputVector1.size()), - &InputVecSRVResource1, &InputVecSRVUploadResource1); - CreateTestResources(D3DDevice, CommandList, InputVector2.data(), - InputVector2.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputVector2.size()), - &InputVecSRVResource2, &InputVecSRVUploadResource2); + + CreateTestResources( + D3DDevice, CommandList, InputVector1.getBuffer(), + InputVector1.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputVector1.getTotalBytes()), + &InputVecSRVResource1, &InputVecSRVUploadResource1); + CreateTestResources( + D3DDevice, CommandList, InputVector2.getBuffer(), + InputVector2.getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputVector2.getTotalBytes()), + &InputVecSRVResource2, &InputVecSRVUploadResource2); // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector1.size() / sizeof(int32_t)), + (UINT)(InputVector1.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource1); CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector2.size() / sizeof(int32_t)), + (UINT)(InputVector2.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource2); - // Calculate reference output - auto ExpectedOutputBufferI8 = - CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, Config.DimM); - std::vector ExpectedOutputBuffer(ExpectedOutputBufferI8.size() / - sizeof(float)); - std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(), - ExpectedOutputBufferI8.size()); - - if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { - std::vector InputVector1FP16( - InputVector1.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputVector1FP16.data(), InputVector1.data(), - InputVector1.size()); - - std::vector InputVector2FP16( - InputVector2.size() / sizeof(DirectX::PackedVector::HALF)); - std::memcpy(InputVector2FP16.data(), InputVector2.data(), - InputVector2.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int M = 0; M < Config.DimM; ++M) { - for (int N = 0; N < Config.DimN; ++N) { - float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) * - ConvertFloat16ToFloat32(InputVector2FP16[N]); - ExpectedOutputBuffer[M * Config.DimN + N] += acc; - } - } - } - } else if (AccumulateProps.InputType == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - std::vector InputVector1FP32(InputVector1.size() / sizeof(float)); - std::memcpy(InputVector1FP32.data(), InputVector1.data(), - InputVector1.size()); - - std::vector InputVector2FP32(InputVector2.size() / sizeof(float)); - std::memcpy(InputVector2FP32.data(), InputVector2.data(), - InputVector2.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - for (int M = 0; M < Config.DimM; ++M) { - for (int N = 0; N < Config.DimN; ++N) { - float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] * - InputVector2FP32[ThreadIdx * Config.DimN + N]; - ExpectedOutputBuffer[M * Config.DimN + N] += Acc; - } - } - } - } - CComPtr ConvertedMatrixResource, ConvertedMatrixReadResource; int ConvertedMatrixSize = 0; { From 91b2c7613dc380006e2aead1d8f0a451769bc833 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 6 May 2025 16:17:37 -0400 Subject: [PATCH 02/26] Support odd matrix/vector sizes --- .../unittests/HLSLExec/ExecutionTest.cpp | 84 +++++++++++++------ 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index f47b4624d6..934210af1f 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12149,6 +12149,14 @@ void ExecutionTest::runCoopVecMulTestConfig( {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, @@ -12157,6 +12165,14 @@ void ExecutionTest::runCoopVecMulTestConfig( {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, @@ -12165,6 +12181,14 @@ void ExecutionTest::runCoopVecMulTestConfig( {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, false}, {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, @@ -12181,6 +12205,22 @@ void ExecutionTest::runCoopVecMulTestConfig( false}, {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, }; for (auto Config : TestConfigs) { @@ -12280,18 +12320,15 @@ void ExecutionTest::runCoopVecMulSubtest( // FIXME: This does not capture all cases, but is sufficient for the preview // feature set if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer(); - float *InputVectorF32 = (float *)InputVector.getBuffer(); - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + int32_t *InputBiasI32 = InputBias.getVector(0); for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { int Acc = 0; for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { int InputElem; if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int) - InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx]; + InputElem = (int)InputVector.getVector(ThreadIdx)[InputIdx]; } else { InputElem = InputVector.getVector(ThreadIdx)[InputIdx]; } @@ -12315,22 +12352,21 @@ void ExecutionTest::runCoopVecMulSubtest( D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - DirectX::PackedVector::HALF *InputVectorFP16 = - (DirectX::PackedVector::HALF *)InputVector.getBuffer(); - DirectX::PackedVector::HALF *InputBiasFP16 = - (DirectX::PackedVector::HALF *)InputBias.getBuffer(); - // The CPU reference matrix is float std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + DirectX::PackedVector::HALF *InputVectorFP16 = + InputVector.getVector(ThreadIdx); + DirectX::PackedVector::HALF *InputBiasFP16 = + InputBias.getVector(0); for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { float Acc = 0; for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - float const InputElem = ConvertFloat16ToFloat32( - InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]); + float const InputElem = + ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]); float const MatrixElem = InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; Acc += InputElem * MatrixElem; @@ -12365,7 +12401,7 @@ void main(uint threadIdx : SV_GroupThreadID) using namespace dx::linalg; uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); - vector input = InputVector.Load >(inputOffset); + vector input = InputVector.Load >(inputOffset); MatrixRef mat = { InputMatrix, 0, STRIDE }; @@ -12439,8 +12475,9 @@ void main(uint threadIdx : SV_GroupThreadID) auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); auto InputDataTypeDefine = CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType); - auto InputDivisorDefine = - CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor); + auto InputDivisorDefine = CreateDefineFromInt( + L"INPUT_VECTOR_NUM_ELEMENTS", + (Config.InputPerThread + InputDivisor - 1) / InputDivisor); auto AccumDataTypeDefine = CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType); auto InputInterpretationEnumDefine = CreateDefineFromString( @@ -12596,11 +12633,12 @@ void main(uint threadIdx : SV_GroupThreadID) &ConvertInfo.DestInfo); } + int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16; + // Create resource to hold matrix copy - CreateTestResources( - D3DDevice, CommandList, nullptr, 0, - CD3DX12_RESOURCE_DESC::Buffer(ConvertInfo.DestInfo.DestSize), - &ConvertedMatrixResource, nullptr); + CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize, + CD3DX12_RESOURCE_DESC::Buffer(SRVSize), + &ConvertedMatrixResource, nullptr); // Set up data descriptors ConvertInfo.DataDesc.DestVA = @@ -12613,13 +12651,7 @@ void main(uint threadIdx : SV_GroupThreadID) __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); - // This increments baseHandle - if ((ConvertInfo.DestInfo.DestSize % 4) != 0) { - WEX::Logging::Log::Error(L"DestSize is not aligned to 4 bytes"); - return; - } - CreateRawSRV(D3DDevice, BaseHandle, - ConvertInfo.DestInfo.DestSize / sizeof(int32_t), + CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t), ConvertedMatrixResource); } From ce38c677a104ae3e5c165dd91b01cf92964f6a01 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 13:10:59 -0400 Subject: [PATCH 03/26] Finish support for NumLayers=2 --- tools/clang/unittests/HLSLExec/CoopVec.h | 275 ++++++++++++ .../unittests/HLSLExec/ExecutionTest.cpp | 410 +++++++++--------- 2 files changed, 492 insertions(+), 193 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index cd24a556bd..b5c0a2f355 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -448,6 +448,74 @@ struct TestVector { uint8_t *getBuffer() { return Buffer; } const uint8_t *getBuffer() const { return Buffer; } + // Copy assignment operator + TestVector &operator=(const TestVector &other) { + if (this != &other) { + // Free existing buffer + if (Buffer) { +#ifdef _MSC_VER + _aligned_free(Buffer); +#else + std::free(Buffer); +#endif + Buffer = nullptr; + } + + // Copy metadata + NumVectors = other.NumVectors; + VectorSize = other.VectorSize; + ElementSize = other.ElementSize; + Stride = other.Stride; + TotalBytes = other.TotalBytes; + + // Allocate new buffer + void *Ptr = nullptr; +#ifdef _MSC_VER + Ptr = _aligned_malloc(TotalBytes, 16); +#else + Ptr = std::aligned_alloc(16, TotalBytes); +#endif + Buffer = reinterpret_cast(Ptr); + + // Copy data + if (other.Buffer) { + std::memcpy(Buffer, other.Buffer, TotalBytes); + } + } + return *this; + } + + // Move assignment operator + TestVector &operator=(TestVector &&other) noexcept { + if (this != &other) { + // Free existing buffer + if (Buffer) { +#ifdef _MSC_VER + _aligned_free(Buffer); +#else + std::free(Buffer); +#endif + } + + // Move metadata and buffer + NumVectors = other.NumVectors; + VectorSize = other.VectorSize; + ElementSize = other.ElementSize; + Stride = other.Stride; + TotalBytes = other.TotalBytes; + Buffer = other.Buffer; + + // Reset the source object + other.NumVectors = 0; + other.VectorSize = 0; + other.ElementSize = 0; + other.Stride = 0; + other.TotalBytes = 0; + other.Buffer = nullptr; + } + return *this; + } + template T *getVector(size_t I) { uint8_t *Ptr = Buffer + I * Stride; return reinterpret_cast(Ptr); @@ -481,6 +549,20 @@ struct TestVector { } } + template void fillAllOnesTestData() { + // Create a vector of (1, 1, 1, ...) + for (size_t I = 0; I < NumVectors; ++I) { + T *Vec = getVector(I); + for (size_t J = 0; J < VectorSize; ++J) + if constexpr (std::is_same_v) { + // Special case for HALF, which requires conversion from float + Vec[J] = static_cast(ConvertFloat32ToFloat16(1.0f)); + } else { + Vec[J] = static_cast(1); + } + } + } + static TestVector createSimpleTestVector(size_t NumVectors, size_t VectorSize, D3D12_LINEAR_ALGEBRA_DATATYPE DataType, @@ -553,6 +635,199 @@ struct TestVector { } return Vec; } + + static TestVector + createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize, + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + size_t ElementSize; + switch (DataInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + ElementSize = sizeof(int8_t); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + ElementSize = sizeof(float); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + TestVector Vec(NumVectors, VectorSize, ElementSize); + switch (DataInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + Vec.fillAllOnesTestData(); + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + Vec.fillAllOnesTestData(); + break; + default: + throw std::invalid_argument("Unsupported data type"); + } + return Vec; + } + + D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO + getConversionInfo(ID3D12Device *D3DDevice, + D3D12_LINEAR_ALGEBRA_DATATYPE DestDataType, + D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout) { + // Create source matrix info + D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {}; + ConvertInfo.SrcInfo.SrcDataType = + ::CoopVecHelpers::GetMatrixSrcDataType(DestDataType); + ConvertInfo.SrcInfo.SrcLayout = + D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; + + // Create destination matrix info + ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver + int DestEltSize = 0; + switch (DestDataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: + ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8; + DestEltSize = 1; + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16; + DestEltSize = 2; // FP16 + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + ConvertInfo.DestInfo.DestDataType = + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3; + DestEltSize = 1; // FP8 + break; + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + ConvertInfo.DestInfo.DestDataType = + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2; + DestEltSize = 1; // FP8 + break; + } + ConvertInfo.SrcInfo.SrcStride = (UINT)getStride(); + ConvertInfo.SrcInfo.SrcSize = (UINT)getTotalBytes(); + + ConvertInfo.DestInfo.DestLayout = MatrixLayout; + ConvertInfo.DestInfo.DestStride = 0; + ConvertInfo.DestInfo.NumRows = (UINT)getNumVectors(); + ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize(); + + if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { + ConvertInfo.DestInfo.DestStride = (UINT)getVectorSize() * DestEltSize; + } else if (MatrixLayout == + D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { + ConvertInfo.DestInfo.DestStride = (UINT)getNumVectors() * DestEltSize; + } + + // Get destination size using preview interface + { + CComPtr PreviewDevice; + VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview), + (void **)&PreviewDevice)); + + // Query required destination size + PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo( + &ConvertInfo.DestInfo); + } + + return ConvertInfo; + } + + static TestVector + matrixVectorMultiply(const TestVector &Matrix, const TestVector &InputVector, + const TestVector &Bias, bool HasBias, + D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, + D3D12_LINEAR_ALGEBRA_DATATYPE InputType) { + bool IsFP32 = false; + switch (MatrixInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + IsFP32 = true; + break; + default: + break; + } + + TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(), + sizeof(float)); + + if (IsFP32) { + for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { + const DirectX::PackedVector::HALF *InputBiasFP16 = + Bias.getVector(0); + for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); + ++OutputIdx) { + float Acc = 0; + + for (int InputIdx = 0; InputIdx < Matrix.getVectorSize(); + ++InputIdx) { + float InputElem; + if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + InputElem = InputVector.getVector(VecIdx)[InputIdx]; + } else { + InputElem = ConvertFloat16ToFloat32( + InputVector.getVector( + VecIdx)[InputIdx]); + } + float const MatrixElem = + Matrix.getVector(OutputIdx)[InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (HasBias) { + Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); + } + + float Result = Acc; + ResultVec.getVector(VecIdx)[OutputIdx] = Result; + } + } + } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { + for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { + const int32_t *InputBiasI32 = Bias.getVector(0); + for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); + ++OutputIdx) { + int Acc = 0; + + for (int InputIdx = 0; InputIdx < Matrix.getVectorSize(); + ++InputIdx) { + int InputElem; + if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + InputElem = (int)InputVector.getVector(VecIdx)[InputIdx]; + } else { + InputElem = InputVector.getVector(VecIdx)[InputIdx]; + } + int const MatrixElem = + Matrix.getVector(OutputIdx)[InputIdx]; + Acc += InputElem * MatrixElem; + } + + if (HasBias) { + Acc += InputBiasI32[OutputIdx]; + } + + float Result = float(Acc); + ResultVec.getVector(VecIdx)[OutputIdx] = Result; + } + } + } else { + throw std::invalid_argument("Unsupported matrix interpretation"); + } + + return ResultVec; + } }; }; // namespace CoopVecHelpers diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 934210af1f..a613f28139 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -789,7 +789,7 @@ class ExecutionTest { int InputPerThread; int OutputPerThread; int NumThreads; - int NumLevels; + int NumLayers; D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout; bool Bias; }; @@ -12221,6 +12221,88 @@ void ExecutionTest::runCoopVecMulTestConfig( false}, {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, true}, + + // NumLayers=2 tests + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + false}, + {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL, + true}, }; for (auto Config : TestConfigs) { @@ -12234,6 +12316,21 @@ void ExecutionTest::runCoopVecMulTestConfig( continue; } + if (Config.NumLayers > 1 && + (MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 || + MulProps.InputInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + MulProps.InputInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) && + (MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 || + MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32)) { + // We do not support multi-layer tests with packed types as input with + // full-precision integer bias Supporting this in the current framework + // would require repacking the accumulator vectors + continue; + } + bool IsInFilter = CoopVecHelpers::IsMatrixLayoutInFilter( L"CoopVecMatrixLayout", Config.MatrixLayout); if (!IsInFilter) { @@ -12250,9 +12347,9 @@ void ExecutionTest::runCoopVecMulSubtest( LogCommentFmt( L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: " - L"%d, NumLevels: %d, Bias: %s, MatrixLayout: %s", + L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s", Config.InputPerThread, Config.OutputPerThread, Config.NumThreads, - Config.NumLevels, Config.Bias ? L"true" : L"false", + Config.NumLayers, Config.Bias ? L"true" : L"false", CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str()); const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4); @@ -12261,8 +12358,8 @@ void ExecutionTest::runCoopVecMulSubtest( CComPtr RootSignature; { CD3DX12_DESCRIPTOR_RANGE Ranges[2]; - Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 3, 0, - 0); // InputVector, InputMatrix, InputBias + Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0, + 0); // InputVector, InputBias, InputMatrices[] Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer CreateRootSignatureFromRanges(D3DDevice, &RootSignature, Ranges, 2, nullptr, 0); @@ -12273,7 +12370,7 @@ void ExecutionTest::runCoopVecMulSubtest( { D3D12_DESCRIPTOR_HEAP_DESC Desc = {}; Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - Desc.NumDescriptors = 4; + Desc.NumDescriptors = 3 + Config.NumLayers; Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; VERIFY_SUCCEEDED( D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap))); @@ -12281,106 +12378,35 @@ void ExecutionTest::runCoopVecMulSubtest( CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle( DescriptorHeap->GetCPUDescriptorHandleForHeapStart()); - // Setup input data - auto ExpectedOutputBuffer = - std::make_unique(Config.OutputPerThread * Config.NumThreads); - - std::vector InputMatrix; - if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) { - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // Matrix source data is fp32, which gets converted to fp16 during matrix - // conversion - InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix( - Config.InputPerThread, Config.OutputPerThread); - } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); - return; - } + // Our input matrix is really a set of row vectors, which we can represent + // as a TestVector. + std::vector<::CoopVecHelpers::TestVector> InputMatrices; + for (int I = 0; I < Config.NumLayers - 1; ++I) { + // Each layer except the last is InputPerThread x InputPerThread + InputMatrices.push_back( + ::CoopVecHelpers::TestVector::createAllOnesTestMatrix( + Config.InputPerThread, Config.InputPerThread, + MulProps.MatrixInterpretation)); + } + // Last layer, matrix size is OutputPerThread x InputPerThread + InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix( + Config.OutputPerThread, Config.InputPerThread, + MulProps.MatrixInterpretation)); auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector( Config.NumThreads, Config.InputPerThread, MulProps.InputType, MulProps.InputInterpretation); auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector( - 1, Config.OutputPerThread, MulProps.BiasInterpretation, - MulProps.BiasInterpretation); + 1, std::max(Config.OutputPerThread, Config.InputPerThread), + MulProps.BiasInterpretation, MulProps.BiasInterpretation); // Calculate reference output - // FIXME: This does not capture all cases, but is sufficient for the preview - // feature set - if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - int32_t *InputBiasI32 = InputBias.getVector(0); - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - int Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - int InputElem; - if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int)InputVector.getVector(ThreadIdx)[InputIdx]; - } else { - InputElem = InputVector.getVector(ThreadIdx)[InputIdx]; - } - int const MatrixElem = - InputMatrix[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += InputBiasI32[OutputIdx]; - } - - float Result = float(Acc); - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } - } else if (MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 || - MulProps.MatrixInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { - // The CPU reference matrix is float - std::vector InputMatrixFP32(InputMatrix.size() / sizeof(float)); - std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size()); - - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { - DirectX::PackedVector::HALF *InputVectorFP16 = - InputVector.getVector(ThreadIdx); - DirectX::PackedVector::HALF *InputBiasFP16 = - InputBias.getVector(0); - for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) { - float Acc = 0; - - for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) { - float const InputElem = - ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]); - float const MatrixElem = - InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx]; - Acc += InputElem * MatrixElem; - } - - if (Config.Bias) { - Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); - } - - float Result = Acc; - ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] = - Result; - } - } + auto ExpectedOutput = InputVector; + for (int I = 0; I < Config.NumLayers; ++I) { + ExpectedOutput = ::CoopVecHelpers::TestVector::matrixVectorMultiply( + InputMatrices[I], ExpectedOutput, InputBias, Config.Bias, + MulProps.MatrixInterpretation, + I == 0 ? MulProps.InputType : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32); } // Create the compute pipeline state for the CoopVec shader @@ -12391,7 +12417,7 @@ void ExecutionTest::runCoopVecMulSubtest( ByteAddressBuffer InputVector : register(t0); ByteAddressBuffer InputBias : register(t1); -ByteAddressBuffer InputMatrix : register(t2); +ByteAddressBuffer InputMatrix[NUM_LAYERS] : register(t2); RWByteAddressBuffer OutputBuffer: register(u0); [shader("compute")] @@ -12402,25 +12428,57 @@ void main(uint threadIdx : SV_GroupThreadID) uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); vector input = InputVector.Load >(inputOffset); + VectorRef biasVec = { InputBias, 0 }; + + vector output; +)"; + + if (Config.NumLayers == 1) { + ShaderSource += R"( + MatrixRef mat = { InputMatrix[0], 0, STRIDE }; + + if (USE_BIAS) { + output = MulAdd(mat, MakeInterpretedVector(input), biasVec); + } else { + output = Mul(mat, MakeInterpretedVector(input)); + } +)"; + } else if (Config.NumLayers == 2) { + ShaderSource += R"( + vector accum; - MatrixRef mat = { InputMatrix, 0, STRIDE }; + MatrixRef mat0 = { InputMatrix[0], 0, STRIDE }; + if (USE_BIAS) { + accum = MulAdd(mat0, MakeInterpretedVector(input), biasVec); + //accum = Mul(mat0, MakeInterpretedVector(input)); + } else { + accum = Mul(mat0, MakeInterpretedVector(input)); + } - vector accum; + // Dummy activation function; all of our intermediates are positive (currently). + accum = max(accum, 0); + MatrixRef mat1 = { InputMatrix[1], 0, STRIDE }; if (USE_BIAS) { - VectorRef biasVec = { InputBias, 0 }; - accum = MulAdd(mat, MakeInterpretedVector(input), biasVec); + output = MulAdd(mat1, MakeInterpretedVector(accum), biasVec); } else { - accum = Mul(mat, MakeInterpretedVector(input)); + output = Mul(mat1, MakeInterpretedVector(accum)); } +)"; + } - vector result = (vector)accum; + ShaderSource += R"( + vector result = (vector)output; // Ensure 4-byte alignment for vector store uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float); OutputBuffer.Store >(outputOffset, result); } - )"; +)"; + +#if 0 + printf("%s\n", ShaderSource.c_str()); +#endif auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { std::wstringstream Stream; @@ -12462,7 +12520,7 @@ void main(uint threadIdx : SV_GroupThreadID) const std::wstring InputInterpretationEnum = CoopVecHelpers::GetHlslInterpretationForDataType( MulProps.InputInterpretation); - const std::wstring AccumInterpretationEnum = + const std::wstring BiasInterpretationEnum = CoopVecHelpers::GetHlslInterpretationForDataType( MulProps.BiasInterpretation); @@ -12487,10 +12545,15 @@ void main(uint threadIdx : SV_GroupThreadID) auto MatrixDataTypeEnumDefine = CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum); auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); + // Treat the accumulator interpretation the same as the input interpretation + // for the purposes of MakeInterpretedVector. auto AccumInterpretationEnumDefine = CreateDefineFromString( - L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum); + L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum); auto InputVectorStrideDefine = CreateDefineFromInt( L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); + auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers); + auto BiasInterpretationEnumDefine = CreateDefineFromString( + L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -12507,8 +12570,18 @@ void main(uint threadIdx : SV_GroupThreadID) UseBiasDefine.c_str(), AccumInterpretationEnumDefine.c_str(), InputVectorStrideDefine.c_str(), + NumLayersDefine.c_str(), + BiasInterpretationEnumDefine.c_str(), }; +#if 0 + // Print options for debugging + WEX::Logging::Log::Comment(L"Shader compilation options:"); + for (UINT i = 0; i < _countof(Options); i++) { + WEX::Logging::Log::Comment(Options[i]); + } +#endif + CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); @@ -12531,14 +12604,17 @@ void main(uint threadIdx : SV_GroupThreadID) 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, IID_PPV_ARGS(&CommandList))); - // Setup input matrix as all-ones in sint8 format. This will later be - // converted to the appropriate data type by the matrix conversion API. - CComPtr InputMatrixSRVResource, InputMatrixSRVUploadResource; - - CreateTestResources(D3DDevice, CommandList, InputMatrix.data(), - InputMatrix.size(), - CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()), - &InputMatrixSRVResource, &InputMatrixSRVUploadResource); + std::vector> InputMatrixSRVResources( + Config.NumLayers); + std::vector> InputMatrixSRVUploadResources( + Config.NumLayers); + for (int I = 0; I < Config.NumLayers; ++I) { + CreateTestResources( + D3DDevice, CommandList, InputMatrices[I].getBuffer(), + InputMatrices[I].getTotalBytes(), + CD3DX12_RESOURCE_DESC::Buffer(InputMatrices[I].getTotalBytes()), + &InputMatrixSRVResources[I], &InputMatrixSRVUploadResources[I]); + } // Create input vector of an appropriate type. All integer types start as // SINT8 for now. @@ -12568,82 +12644,25 @@ void main(uint threadIdx : SV_GroupThreadID) (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)), InputBiasSRVResource); - CComPtr ConvertedMatrixResource; - { - // Create source matrix info - D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {}; - ConvertInfo.SrcInfo.SrcDataType = - CoopVecHelpers::GetMatrixSrcDataType(MulProps.MatrixInterpretation); - ConvertInfo.SrcInfo.SrcLayout = - D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; - - // Create destination matrix info - ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver - int SrcEltSize = 0; - int DestEltSize = 0; - switch (MulProps.MatrixInterpretation) { - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: - ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8; - SrcEltSize = 1; - DestEltSize = 1; - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: - ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16; - SrcEltSize = 4; // FP32 - DestEltSize = 2; // FP16 - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: - ConvertInfo.DestInfo.DestDataType = - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3; - SrcEltSize = 4; // FP32 - DestEltSize = 1; // FP8 - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: - ConvertInfo.DestInfo.DestDataType = - D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2; - SrcEltSize = 4; // FP32 - DestEltSize = 1; // FP8 - break; - } - ConvertInfo.SrcInfo.SrcStride = Config.InputPerThread * SrcEltSize; - ConvertInfo.SrcInfo.SrcSize = - Config.InputPerThread * Config.OutputPerThread * SrcEltSize; - - ConvertInfo.DestInfo.DestLayout = Config.MatrixLayout; - ConvertInfo.DestInfo.DestStride = 0; - ConvertInfo.DestInfo.NumRows = Config.OutputPerThread; - ConvertInfo.DestInfo.NumColumns = Config.InputPerThread; - - if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { - ConvertInfo.DestInfo.DestStride = Config.InputPerThread * DestEltSize; - } else if (Config.MatrixLayout == - D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { - ConvertInfo.DestInfo.DestStride = Config.OutputPerThread * DestEltSize; - } - - // Get destination size using preview interface - { - CComPtr PreviewDevice; - VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview), - (void **)&PreviewDevice)); - - // Query required destination size - PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo( - &ConvertInfo.DestInfo); - } + // Create converted matrix resource and SRV for each input matrix + std::vector> ConvertedMatrixResources( + Config.NumLayers); + for (int I = 0; I < Config.NumLayers; ++I) { + auto ConvertInfo = InputMatrices[I].getConversionInfo( + D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout); int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16; // Create resource to hold matrix copy CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize, CD3DX12_RESOURCE_DESC::Buffer(SRVSize), - &ConvertedMatrixResource, nullptr); + &ConvertedMatrixResources[I], nullptr); // Set up data descriptors ConvertInfo.DataDesc.DestVA = - ConvertedMatrixResource->GetGPUVirtualAddress(); - ConvertInfo.DataDesc.SrcVA = InputMatrixSRVResource->GetGPUVirtualAddress(); + ConvertedMatrixResources[I]->GetGPUVirtualAddress(); + ConvertInfo.DataDesc.SrcVA = + InputMatrixSRVResources[I]->GetGPUVirtualAddress(); // Get command list interface and perform conversion CComPtr CommandList11; @@ -12651,8 +12670,9 @@ void main(uint threadIdx : SV_GroupThreadID) __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); + // This increments BaseHandle CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t), - ConvertedMatrixResource); + ConvertedMatrixResources[I]); } CComPtr UavResource; @@ -12697,14 +12717,18 @@ void main(uint threadIdx : SV_GroupThreadID) float *ResultBuffer = (float *)MappedData.data(); bool Equal = true; - for (int i = 0; i < OutputBufferSize / sizeof(float); i++) { - if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) || - fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) { - LogErrorFmt(L"Result mismatch at index %d", i); - LogErrorFmt(L"ResultBuffer[%d]: %f, ExpectedOutputBuffer[%d]: %f", i, - ResultBuffer[i], i, ExpectedOutputBuffer[i]); - Equal = false; - break; + + for (int i = 0; i < Config.NumThreads; ++i) { + for (int j = 0; j < Config.OutputPerThread; ++j) { + float Result = ResultBuffer[i * Config.OutputPerThread + j]; + float Expected = ExpectedOutput.getVector(i)[j]; + if (isnan(Result) || isnan(Expected) || + fabs(Result - Expected) > 0.00001) { + LogErrorFmt(L"Result mismatch at index %d", + i * Config.OutputPerThread + j); + LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected); + Equal = false; + } } } VERIFY_IS_TRUE(Equal); From 03cf74d28a55b3472d2f20295f75c23ee54bcb8d Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 13:15:01 -0400 Subject: [PATCH 04/26] Remove dead code --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index a613f28139..3d69815034 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12476,10 +12476,6 @@ void main(uint threadIdx : SV_GroupThreadID) } )"; -#if 0 - printf("%s\n", ShaderSource.c_str()); -#endif - auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { std::wstringstream Stream; Stream << L"-D" << Name << L"=" << Value; @@ -12574,14 +12570,6 @@ void main(uint threadIdx : SV_GroupThreadID) BiasInterpretationEnumDefine.c_str(), }; -#if 0 - // Print options for debugging - WEX::Logging::Log::Comment(L"Shader compilation options:"); - for (UINT i = 0; i < _countof(Options); i++) { - WEX::Logging::Log::Comment(Options[i]); - } -#endif - CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); From 68069f0209097fa80ac370636387a8e13e61c973 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Thu, 8 May 2025 07:25:28 -0400 Subject: [PATCH 05/26] Remove dead line --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 3d69815034..ef769b12f7 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12450,7 +12450,6 @@ void main(uint threadIdx : SV_GroupThreadID) MatrixRef mat0 = { InputMatrix[0], 0, STRIDE }; if (USE_BIAS) { accum = MulAdd(mat0, MakeInterpretedVector(input), biasVec); - //accum = Mul(mat0, MakeInterpretedVector(input)); } else { accum = Mul(mat0, MakeInterpretedVector(input)); } From 33bcadf518fa3e22bb0ad1aa66bd787cdbf4daf5 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Thu, 8 May 2025 07:27:07 -0400 Subject: [PATCH 06/26] Add comment about ambiguous IsFP32 flag --- tools/clang/unittests/HLSLExec/CoopVec.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index b5c0a2f355..18b8669197 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -749,12 +749,13 @@ struct TestVector { const TestVector &Bias, bool HasBias, D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, D3D12_LINEAR_ALGEBRA_DATATYPE InputType) { - bool IsFP32 = false; + // The CPU reference matrix is FP32 for all FP interpretations. + bool IsMatrixFP32 = false; switch (MatrixInterpretation) { case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: - IsFP32 = true; + IsMatrixFP32 = true; break; default: break; @@ -763,7 +764,7 @@ struct TestVector { TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(), sizeof(float)); - if (IsFP32) { + if (IsMatrixFP32) { for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { const DirectX::PackedVector::HALF *InputBiasFP16 = Bias.getVector(0); From 70d642e2e5806de791ea66e017eb31d3b87e62bd Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 15:45:54 -0400 Subject: [PATCH 07/26] Initial support for CoopVec pixel shader tests --- .../unittests/HLSLExec/ExecutionTest.cpp | 342 ++++++++++++------ 1 file changed, 228 insertions(+), 114 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index ef769b12f7..51206893e9 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -799,7 +799,7 @@ class ExecutionTest { D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps); void runCoopVecMulSubtest(ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps, - CoopVecMulSubtestConfig &Config); + CoopVecMulSubtestConfig &Config, bool RunCompute); struct CoopVecOuterProductSubtestConfig { int DimM; // Row Count @@ -815,6 +815,7 @@ class ExecutionTest { ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps, CoopVecOuterProductSubtestConfig &Config); + #endif // HAVE_COOPVEC_API template @@ -12337,13 +12338,15 @@ void ExecutionTest::runCoopVecMulTestConfig( continue; } - runCoopVecMulSubtest(D3DDevice, MulProps, Config); + // Run once as compute, then again as graphics (pixel shader) + runCoopVecMulSubtest(D3DDevice, MulProps, Config, true); + runCoopVecMulSubtest(D3DDevice, MulProps, Config, false); } } void ExecutionTest::runCoopVecMulSubtest( ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps, - CoopVecMulSubtestConfig &Config) { + CoopVecMulSubtestConfig &Config, bool RunCompute) { LogCommentFmt( L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: " @@ -12361,8 +12364,17 @@ void ExecutionTest::runCoopVecMulSubtest( Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0, 0); // InputVector, InputBias, InputMatrices[] Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer - CreateRootSignatureFromRanges(D3DDevice, &RootSignature, Ranges, 2, nullptr, - 0); + + CD3DX12_ROOT_PARAMETER RootParams[2]; + RootParams[0].InitAsDescriptorTable(_countof(Ranges), Ranges, + D3D12_SHADER_VISIBILITY_ALL); + RootParams[1].InitAsUnorderedAccessView(/* register */ 10, /* space */ 0, + D3D12_SHADER_VISIBILITY_ALL); + + CD3DX12_ROOT_SIGNATURE_DESC RootSignatureDesc; + RootSignatureDesc.Init(_countof(RootParams), RootParams, 0, nullptr, + D3D12_ROOT_SIGNATURE_FLAG_NONE); + CreateRootSignatureFromDesc(D3DDevice, &RootSignatureDesc, &RootSignature); } // Create descriptor heap with space for 4 descriptors: 3 SRVs and 1 UAV @@ -12411,8 +12423,8 @@ void ExecutionTest::runCoopVecMulSubtest( // Create the compute pipeline state for the CoopVec shader CComPtr ComputePipelineState; - { - std::string ShaderSource = R"( + + std::string ShaderSource = R"( #include "dx/linalg.h" ByteAddressBuffer InputVector : register(t0); @@ -12420,9 +12432,9 @@ ByteAddressBuffer InputBias : register(t1); ByteAddressBuffer InputMatrix[NUM_LAYERS] : register(t2); RWByteAddressBuffer OutputBuffer: register(u0); -[shader("compute")] -[numthreads(NUM_THREADS, 1, 1)] -void main(uint threadIdx : SV_GroupThreadID) +RWStructuredBuffer AtomicCounter : register(u10); + +void RunCoopVecTest(uint threadIdx) { using namespace dx::linalg; @@ -12433,8 +12445,8 @@ void main(uint threadIdx : SV_GroupThreadID) vector output; )"; - if (Config.NumLayers == 1) { - ShaderSource += R"( + if (Config.NumLayers == 1) { + ShaderSource += R"( MatrixRef mat = { InputMatrix[0], 0, STRIDE }; if (USE_BIAS) { @@ -12443,8 +12455,8 @@ void main(uint threadIdx : SV_GroupThreadID) output = Mul(mat, MakeInterpretedVector(input)); } )"; - } else if (Config.NumLayers == 2) { - ShaderSource += R"( + } else if (Config.NumLayers == 2) { + ShaderSource += R"( vector accum; MatrixRef mat0 = { InputMatrix[0], 0, STRIDE }; @@ -12464,117 +12476,168 @@ void main(uint threadIdx : SV_GroupThreadID) output = Mul(mat1, MakeInterpretedVector(accum)); } )"; - } + } - ShaderSource += R"( + ShaderSource += R"( vector result = (vector)output; // Ensure 4-byte alignment for vector store uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float); OutputBuffer.Store >(outputOffset, result); } -)"; - auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { - std::wstringstream Stream; - Stream << L"-D" << Name << L"=" << Value; - return Stream.str(); - }; +[shader("compute")] +[numthreads(NUM_THREADS, 1, 1)] +void main(uint threadIdx : SV_GroupThreadID) +{ + RunCoopVecTest(threadIdx); +} - auto CreateDefineFromString = [](const wchar_t *Name, - const std::wstring &Value) { - std::wstringstream Stream; - Stream << L"-D" << Name << L"=" << Value; - return Stream.str(); - }; +float4 vs_main(uint vid : SV_VertexID) : SV_Position { + switch (vid) { + case 0: + return float4(-1, 1, 0, 0); + case 1: + return float4(3, 1, 0, 0); + case 2: + return float4(-1, -3, 0, 0); + } + return float4(0, 0, 0, 0); +} - int Stride = 0; - const std::wstring HlslMatrixLayout = - CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); - int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType( - MulProps.MatrixInterpretation); - switch (Config.MatrixLayout) { - case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR: - Stride = Config.InputPerThread * StrideMultiplier; - break; - case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR: - Stride = Config.OutputPerThread * StrideMultiplier; - break; - } +float4 ps_main() : SV_Target { + uint threadIdx; + InterlockedAdd(AtomicCounter[0], 1, threadIdx); + RunCoopVecTest(threadIdx); + return float4(1, 1, 1, 1); +} +)"; - const int InputDivisor = - CoopVecHelpers::GetNumPackedElementsForInputDataType( - MulProps.InputInterpretation); - const std::wstring InputDataType = - CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.InputType); - const std::wstring AccumDataType = - CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.BiasInterpretation); - const std::wstring MatrixDataTypeEnum = - CoopVecHelpers::GetHlslInterpretationForDataType( - MulProps.MatrixInterpretation); - const std::wstring InputInterpretationEnum = - CoopVecHelpers::GetHlslInterpretationForDataType( - MulProps.InputInterpretation); - const std::wstring BiasInterpretationEnum = - CoopVecHelpers::GetHlslInterpretationForDataType( - MulProps.BiasInterpretation); + auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { + std::wstringstream Stream; + Stream << L"-D" << Name << L"=" << Value; + return Stream.str(); + }; - auto InputPerThreadDefine = - CreateDefineFromInt(L"INPUT_PER_THREAD", Config.InputPerThread); - auto OutputPerThreadDefine = - CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread); - auto NumThreadsDefine = - CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); - auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); - auto InputDataTypeDefine = - CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType); - auto InputDivisorDefine = CreateDefineFromInt( - L"INPUT_VECTOR_NUM_ELEMENTS", - (Config.InputPerThread + InputDivisor - 1) / InputDivisor); - auto AccumDataTypeDefine = - CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType); - auto InputInterpretationEnumDefine = CreateDefineFromString( - L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum); - auto HlslMatrixLayoutDefine = - CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout); - auto MatrixDataTypeEnumDefine = - CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum); - auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); - // Treat the accumulator interpretation the same as the input interpretation - // for the purposes of MakeInterpretedVector. - auto AccumInterpretationEnumDefine = CreateDefineFromString( - L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum); - auto InputVectorStrideDefine = CreateDefineFromInt( - L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); - auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers); - auto BiasInterpretationEnumDefine = CreateDefineFromString( - L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum); + auto CreateDefineFromString = [](const wchar_t *Name, + const std::wstring &Value) { + std::wstringstream Stream; + Stream << L"-D" << Name << L"=" << Value; + return Stream.str(); + }; - LPCWSTR Options[] = { - L"-enable-16bit-types", - InputPerThreadDefine.c_str(), - OutputPerThreadDefine.c_str(), - NumThreadsDefine.c_str(), - StrideDefine.c_str(), - InputDataTypeDefine.c_str(), - InputDivisorDefine.c_str(), - AccumDataTypeDefine.c_str(), - InputInterpretationEnumDefine.c_str(), - HlslMatrixLayoutDefine.c_str(), - MatrixDataTypeEnumDefine.c_str(), - UseBiasDefine.c_str(), - AccumInterpretationEnumDefine.c_str(), - InputVectorStrideDefine.c_str(), - NumLayersDefine.c_str(), - BiasInterpretationEnumDefine.c_str(), - }; + int Stride = 0; + const std::wstring HlslMatrixLayout = + CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); + int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType( + MulProps.MatrixInterpretation); + switch (Config.MatrixLayout) { + case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR: + Stride = Config.InputPerThread * StrideMultiplier; + break; + case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR: + Stride = Config.OutputPerThread * StrideMultiplier; + break; + } - CComPtr IncludeHandler = - new LinAlgHeaderIncludeHandler(m_support); + const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType( + MulProps.InputInterpretation); + const std::wstring InputDataType = + CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.InputType); + const std::wstring AccumDataType = + CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.BiasInterpretation); + const std::wstring MatrixDataTypeEnum = + CoopVecHelpers::GetHlslInterpretationForDataType( + MulProps.MatrixInterpretation); + const std::wstring InputInterpretationEnum = + CoopVecHelpers::GetHlslInterpretationForDataType( + MulProps.InputInterpretation); + const std::wstring BiasInterpretationEnum = + CoopVecHelpers::GetHlslInterpretationForDataType( + MulProps.BiasInterpretation); + + auto InputPerThreadDefine = + CreateDefineFromInt(L"INPUT_PER_THREAD", Config.InputPerThread); + auto OutputPerThreadDefine = + CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread); + auto NumThreadsDefine = + CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); + auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); + auto InputDataTypeDefine = + CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType); + auto InputDivisorDefine = CreateDefineFromInt( + L"INPUT_VECTOR_NUM_ELEMENTS", + (Config.InputPerThread + InputDivisor - 1) / InputDivisor); + auto AccumDataTypeDefine = + CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType); + auto InputInterpretationEnumDefine = CreateDefineFromString( + L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum); + auto HlslMatrixLayoutDefine = + CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout); + auto MatrixDataTypeEnumDefine = + CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum); + auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); + // Treat the accumulator interpretation the same as the input interpretation + // for the purposes of MakeInterpretedVector. + auto AccumInterpretationEnumDefine = CreateDefineFromString( + L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum); + auto InputVectorStrideDefine = + CreateDefineFromInt(L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); + auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers); + auto BiasInterpretationEnumDefine = CreateDefineFromString( + L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum); + + LPCWSTR Options[] = { + L"-enable-16bit-types", + InputPerThreadDefine.c_str(), + OutputPerThreadDefine.c_str(), + NumThreadsDefine.c_str(), + StrideDefine.c_str(), + InputDataTypeDefine.c_str(), + InputDivisorDefine.c_str(), + AccumDataTypeDefine.c_str(), + InputInterpretationEnumDefine.c_str(), + HlslMatrixLayoutDefine.c_str(), + MatrixDataTypeEnumDefine.c_str(), + UseBiasDefine.c_str(), + AccumInterpretationEnumDefine.c_str(), + InputVectorStrideDefine.c_str(), + NumLayersDefine.c_str(), + BiasInterpretationEnumDefine.c_str(), + }; + CComPtr IncludeHandler = + new LinAlgHeaderIncludeHandler(m_support); + + if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", &ComputePipelineState, Options, _countof(Options), IncludeHandler); + } else { + CComPtr VertexShader; + CComPtr PixelShader; + + CompileFromText(ShaderSource.c_str(), L"vs_main", L"vs_6_9", &VertexShader, + Options, _countof(Options), IncludeHandler); + CompileFromText(ShaderSource.c_str(), L"ps_main", L"ps_6_9", &PixelShader, + Options, _countof(Options), IncludeHandler); + + D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {}; + // psoDesc.InputLayout; + PsoDesc.pRootSignature = RootSignature; + PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader); + PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader); + PsoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + PsoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); + PsoDesc.DepthStencilState.DepthEnable = FALSE; + PsoDesc.DepthStencilState.StencilEnable = FALSE; + PsoDesc.SampleMask = UINT_MAX; + PsoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + PsoDesc.NumRenderTargets = 1; + PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; + PsoDesc.SampleDesc.Count = 1; + VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState( + &PsoDesc, IID_PPV_ARGS(&ComputePipelineState))); } // Create a command list for the compute shader. @@ -12662,6 +12725,14 @@ void main(uint threadIdx : SV_GroupThreadID) ConvertedMatrixResources[I]); } + // Create resource for atomic counter + CComPtr AtomicCounterResource; + uint32_t AtomicCounterInit = 0; + CreateTestResources(D3DDevice, CommandList, &AtomicCounterInit, + sizeof(AtomicCounterInit), + CD3DX12_RESOURCE_DESC::Buffer(sizeof(AtomicCounterInit)), + &AtomicCounterResource, nullptr); + CComPtr UavResource; CComPtr UavUploadResource; CComPtr UavReadResource; @@ -12687,10 +12758,54 @@ void main(uint threadIdx : SV_GroupThreadID) CD3DX12_GPU_DESCRIPTOR_HANDLE ResHandle( DescriptorHeap->GetGPUDescriptorHandleForHeapStart()); - CommandList->SetComputeRootSignature(RootSignature); - CommandList->SetComputeRootDescriptorTable(0, ResHandle); - CommandList->SetPipelineState(ComputePipelineState); - CommandList->Dispatch(1, 1, 1); + CComPtr RtvHeap; + CComPtr RenderTarget; + CComPtr RenderTargetRead; + + if (RunCompute) { + CommandList->SetComputeRootSignature(RootSignature); + CommandList->SetComputeRootDescriptorTable(0, ResHandle); + CommandList->SetPipelineState(ComputePipelineState); + CommandList->Dispatch(1, 1, 1); + } else { + UINT FrameCount = 1; + UINT RtvDescSize = 0; + CreateRtvDescriptorHeap(D3DDevice, FrameCount, &RtvHeap, &RtvDescSize); + CreateRenderTargetAndReadback(D3DDevice, RtvHeap, 100, 100, &RenderTarget, + &RenderTargetRead); + + D3D12_RESOURCE_DESC RtDesc = RenderTarget->GetDesc(); + D3D12_VIEWPORT Viewport; + D3D12_RECT ScissorRect; + + memset(&Viewport, 0, sizeof(Viewport)); + Viewport.Height = (float)RtDesc.Height; + Viewport.Width = (float)RtDesc.Width; + Viewport.MaxDepth = 1.0f; + memset(&ScissorRect, 0, sizeof(ScissorRect)); + ScissorRect.right = (long)RtDesc.Width; + ScissorRect.bottom = RtDesc.Height; + CommandList->SetGraphicsRootSignature(RootSignature); + CommandList->SetGraphicsRootDescriptorTable(0, ResHandle); + CommandList->SetGraphicsRootUnorderedAccessView( + 1, AtomicCounterResource->GetGPUVirtualAddress()); + CommandList->RSSetViewports(1, &Viewport); + CommandList->RSSetScissorRects(1, &ScissorRect); + + // Indicate that the buffer will be used as a render target. + RecordTransitionBarrier(CommandList, RenderTarget, + D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_RENDER_TARGET); + + CD3DX12_CPU_DESCRIPTOR_HANDLE RtvHandle( + RtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, RtvDescSize); + CommandList->OMSetRenderTargets(1, &RtvHandle, FALSE, nullptr); + + CommandList->ClearRenderTargetView(RtvHandle, ClearColor, 0, nullptr); + CommandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + CommandList->DrawInstanced(3, 1, 0, 0); + } + RecordTransitionBarrier(CommandList, UavResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE); @@ -12713,7 +12828,8 @@ void main(uint threadIdx : SV_GroupThreadID) fabs(Result - Expected) > 0.00001) { LogErrorFmt(L"Result mismatch at index %d", i * Config.OutputPerThread + j); - LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected); + LogErrorFmt(L"Result: %f, Expected: %f (stage: %s)", Result, + Expected, RunCompute ? L"compute" : L"pixel"); Equal = false; } } @@ -12923,7 +13039,6 @@ RWByteAddressBuffer AccumMatrix : register(u0); [numthreads(NUM_THREADS, 1, 1)] void main(uint threadIdx : SV_GroupThreadID) { -#if 1 using namespace dx::linalg; // Ensure 4-byte alignment for vector loads @@ -12936,7 +13051,6 @@ void main(uint threadIdx : SV_GroupThreadID) RWMatrixRef mat = { AccumMatrix, 0, STRIDE }; OuterProductAccumulate(input1, input2, mat); -#endif } )"; From ac28b864d8398c64413551fcc605d19bfc4ba8cf Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 15:56:57 -0400 Subject: [PATCH 08/26] Support pixel shaders in OuterProduct tests --- .../unittests/HLSLExec/ExecutionTest.cpp | 310 ++++++++++++------ 1 file changed, 211 insertions(+), 99 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 51206893e9..191899457f 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -814,7 +814,7 @@ class ExecutionTest { void runCoopVecOuterProductSubtest( ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps, - CoopVecOuterProductSubtestConfig &Config); + CoopVecOuterProductSubtestConfig &Config, bool RunCompute); #endif // HAVE_COOPVEC_API @@ -12913,29 +12913,41 @@ void ExecutionTest::runCoopVecOuterProductTestConfig( continue; } - runCoopVecOuterProductSubtest(D3DDevice, AccumulateProps, Config); + // Run once in compute, then once in graphics (pixel shader) + runCoopVecOuterProductSubtest(D3DDevice, AccumulateProps, Config, true); + runCoopVecOuterProductSubtest(D3DDevice, AccumulateProps, Config, false); } } void ExecutionTest::runCoopVecOuterProductSubtest( ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps, - CoopVecOuterProductSubtestConfig &Config) { + CoopVecOuterProductSubtestConfig &Config, bool RunCompute) { LogCommentFmt( - L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s", + L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s, " + L"Stage: %s", Config.DimM, Config.DimN, Config.NumThreads, - CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str()); + CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(), + RunCompute ? L"Compute" : L"Pixel"); // Create root signature with a single root entry for all SRVs and UAVs CComPtr RootSignature; { - CD3DX12_DESCRIPTOR_RANGE ranges[2]; - ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 0, - 0); // InputVector1, InputVector2 - ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // AccumMatrix - CreateRootSignatureFromRanges(D3DDevice, &RootSignature, ranges, 2, nullptr, - 0); + CD3DX12_DESCRIPTOR_RANGE Ranges[2]; + Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 0, 0); + Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); + + CD3DX12_ROOT_PARAMETER RootParams[2]; + RootParams[0].InitAsDescriptorTable(_countof(Ranges), Ranges, + D3D12_SHADER_VISIBILITY_ALL); + RootParams[1].InitAsUnorderedAccessView(/* register */ 10, /* space */ 0, + D3D12_SHADER_VISIBILITY_ALL); + + CD3DX12_ROOT_SIGNATURE_DESC RootSignatureDesc; + RootSignatureDesc.Init(_countof(RootParams), RootParams, 0, nullptr, + D3D12_ROOT_SIGNATURE_FLAG_NONE); + CreateRootSignatureFromDesc(D3DDevice, &RootSignatureDesc, &RootSignature); } // Create descriptor heap with space for 3 descriptors: 2 SRVs and 1 UAV @@ -13027,17 +13039,17 @@ void ExecutionTest::runCoopVecOuterProductSubtest( // Create a compute pipeline state object. CComPtr ComputePipelineState; - { - std::string ShaderSource = R"( + + std::string ShaderSource = R"( #include "dx/linalg.h" ByteAddressBuffer InputVector1 : register(t0); ByteAddressBuffer InputVector2 : register(t1); RWByteAddressBuffer AccumMatrix : register(u0); -[shader("compute")] -[numthreads(NUM_THREADS, 1, 1)] -void main(uint threadIdx : SV_GroupThreadID) +RWStructuredBuffer AtomicCounter : register(u10); + +void RunCoopVecTest(uint threadIdx) { using namespace dx::linalg; @@ -13052,94 +13064,142 @@ void main(uint threadIdx : SV_GroupThreadID) OuterProductAccumulate(input1, input2, mat); } - )"; - auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { - std::wstringstream Stream; - Stream << L"-D" << Name << L"=" << Value; - return Stream.str(); - }; +[shader("compute")] +[numthreads(NUM_THREADS, 1, 1)] +void main(uint threadIdx : SV_GroupThreadID) +{ + RunCoopVecTest(threadIdx); +} - auto CreateDefineFromString = [](const wchar_t *Name, - const wchar_t *Value) { - std::wstringstream Stream; - Stream << L"-D" << Name << L"=" << Value; - return Stream.str(); - }; +float4 vs_main(uint vid : SV_VertexID) : SV_Position { + switch (vid) { + case 0: + return float4(-1, 1, 0, 0); + case 1: + return float4(3, 1, 0, 0); + case 2: + return float4(-1, -3, 0, 0); + } + return float4(0, 0, 0, 0); +} - int Stride = 0; - const std::wstring HlslMatrixLayout = - CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); - int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType( - AccumulateProps.AccumulationType); - switch (Config.MatrixLayout) { - case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR: - Stride = Config.DimN * StrideMultiplier; - break; - case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR: - Stride = Config.DimM * StrideMultiplier; - break; - } +float4 ps_main() : SV_Target { + uint threadIdx; + InterlockedAdd(AtomicCounter[0], 1, threadIdx); + RunCoopVecTest(threadIdx); + return float4(1, 1, 1, 1); +} +)"; - const int InputDivisor = - CoopVecHelpers::GetNumPackedElementsForInputDataType( - AccumulateProps.InputType); - const std::wstring InputDataType = - CoopVecHelpers::GetHlslDataTypeForDataType(AccumulateProps.InputType); - const std::wstring AccumDataType = - CoopVecHelpers::GetHlslDataTypeForDataType( - AccumulateProps.AccumulationType); - const std::wstring MatrixDataTypeEnum = - CoopVecHelpers::GetHlslInterpretationForDataType( - AccumulateProps.AccumulationType); - const std::wstring InputInterpretationEnum = - CoopVecHelpers::GetHlslInterpretationForDataType( - AccumulateProps.InputType); - - auto DimMDefine = CreateDefineFromInt(L"DIM_M", Config.DimM); - auto DimNDefine = CreateDefineFromInt(L"DIM_N", Config.DimN); - auto NumThreadsDefine = - CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); - auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); - auto InputDataTypeDefine = - CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType.c_str()); - auto InputDivisorDefine = - CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor); - auto AccumDataTypeDefine = - CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType.c_str()); - auto InputInterpretationEnumDefine = CreateDefineFromString( - L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum.c_str()); - auto HlslMatrixLayoutDefine = - CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str()); - auto MatrixDataTypeEnumDefine = CreateDefineFromString( - L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str()); - auto InputVector1StrideDefine = CreateDefineFromInt( - L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride()); - auto InputVector2StrideDefine = CreateDefineFromInt( - L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride()); - - LPCWSTR Options[] = { - L"-enable-16bit-types", - DimMDefine.c_str(), - DimNDefine.c_str(), - NumThreadsDefine.c_str(), - StrideDefine.c_str(), - InputDataTypeDefine.c_str(), - InputDivisorDefine.c_str(), - AccumDataTypeDefine.c_str(), - InputInterpretationEnumDefine.c_str(), - HlslMatrixLayoutDefine.c_str(), - MatrixDataTypeEnumDefine.c_str(), - InputVector1StrideDefine.c_str(), - InputVector2StrideDefine.c_str(), - }; + auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { + std::wstringstream Stream; + Stream << L"-D" << Name << L"=" << Value; + return Stream.str(); + }; + + auto CreateDefineFromString = [](const wchar_t *Name, const wchar_t *Value) { + std::wstringstream Stream; + Stream << L"-D" << Name << L"=" << Value; + return Stream.str(); + }; + + int Stride = 0; + const std::wstring HlslMatrixLayout = + CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); + int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType( + AccumulateProps.AccumulationType); + switch (Config.MatrixLayout) { + case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR: + Stride = Config.DimN * StrideMultiplier; + break; + case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR: + Stride = Config.DimM * StrideMultiplier; + break; + } - CComPtr IncludeHandler = - new LinAlgHeaderIncludeHandler(m_support); + const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType( + AccumulateProps.InputType); + const std::wstring InputDataType = + CoopVecHelpers::GetHlslDataTypeForDataType(AccumulateProps.InputType); + const std::wstring AccumDataType = CoopVecHelpers::GetHlslDataTypeForDataType( + AccumulateProps.AccumulationType); + const std::wstring MatrixDataTypeEnum = + CoopVecHelpers::GetHlslInterpretationForDataType( + AccumulateProps.AccumulationType); + const std::wstring InputInterpretationEnum = + CoopVecHelpers::GetHlslInterpretationForDataType( + AccumulateProps.InputType); + auto DimMDefine = CreateDefineFromInt(L"DIM_M", Config.DimM); + auto DimNDefine = CreateDefineFromInt(L"DIM_N", Config.DimN); + auto NumThreadsDefine = + CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); + auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); + auto InputDataTypeDefine = + CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType.c_str()); + auto InputDivisorDefine = CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor); + auto AccumDataTypeDefine = + CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType.c_str()); + auto InputInterpretationEnumDefine = CreateDefineFromString( + L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum.c_str()); + auto HlslMatrixLayoutDefine = + CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str()); + auto MatrixDataTypeEnumDefine = CreateDefineFromString( + L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str()); + auto InputVector1StrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride()); + auto InputVector2StrideDefine = CreateDefineFromInt( + L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride()); + + LPCWSTR Options[] = { + L"-enable-16bit-types", + DimMDefine.c_str(), + DimNDefine.c_str(), + NumThreadsDefine.c_str(), + StrideDefine.c_str(), + InputDataTypeDefine.c_str(), + InputDivisorDefine.c_str(), + AccumDataTypeDefine.c_str(), + InputInterpretationEnumDefine.c_str(), + HlslMatrixLayoutDefine.c_str(), + MatrixDataTypeEnumDefine.c_str(), + InputVector1StrideDefine.c_str(), + InputVector2StrideDefine.c_str(), + }; + + CComPtr IncludeHandler = + new LinAlgHeaderIncludeHandler(m_support); + + if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", &ComputePipelineState, Options, _countof(Options), IncludeHandler); + } else { + CComPtr VertexShader; + CComPtr PixelShader; + + CompileFromText(ShaderSource.c_str(), L"vs_main", L"vs_6_9", &VertexShader, + Options, _countof(Options), IncludeHandler); + CompileFromText(ShaderSource.c_str(), L"ps_main", L"ps_6_9", &PixelShader, + Options, _countof(Options), IncludeHandler); + + D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {}; + // psoDesc.InputLayout; + PsoDesc.pRootSignature = RootSignature; + PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader); + PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader); + PsoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT); + PsoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT); + PsoDesc.DepthStencilState.DepthEnable = FALSE; + PsoDesc.DepthStencilState.StencilEnable = FALSE; + PsoDesc.SampleMask = UINT_MAX; + PsoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE; + PsoDesc.NumRenderTargets = 1; + PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; + PsoDesc.SampleDesc.Count = 1; + VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState( + &PsoDesc, IID_PPV_ARGS(&ComputePipelineState))); } // Create a command list for the compute shader. @@ -13282,6 +13342,14 @@ void main(uint threadIdx : SV_GroupThreadID) ConvertedMatrixResource); } + // Create resource for atomic counter + CComPtr AtomicCounterResource; + uint32_t AtomicCounterInit = 0; + CreateTestResources(D3DDevice, CommandList, &AtomicCounterInit, + sizeof(AtomicCounterInit), + CD3DX12_RESOURCE_DESC::Buffer(sizeof(AtomicCounterInit)), + &AtomicCounterResource, nullptr); + CommandList->Close(); ExecuteCommandList(CommandQueue, CommandList); WaitForSignal(CommandQueue, FO); @@ -13293,10 +13361,54 @@ void main(uint threadIdx : SV_GroupThreadID) CD3DX12_GPU_DESCRIPTOR_HANDLE ResHandle( DescriptorHeap->GetGPUDescriptorHandleForHeapStart()); - CommandList->SetComputeRootSignature(RootSignature); - CommandList->SetComputeRootDescriptorTable(0, ResHandle); - CommandList->SetPipelineState(ComputePipelineState); - CommandList->Dispatch(1, 1, 1); + CComPtr RtvHeap; + CComPtr RenderTarget; + CComPtr RenderTargetRead; + + if (RunCompute) { + CommandList->SetComputeRootSignature(RootSignature); + CommandList->SetComputeRootDescriptorTable(0, ResHandle); + CommandList->SetPipelineState(ComputePipelineState); + CommandList->Dispatch(1, 1, 1); + } else { + UINT FrameCount = 1; + UINT RtvDescSize = 0; + CreateRtvDescriptorHeap(D3DDevice, FrameCount, &RtvHeap, &RtvDescSize); + CreateRenderTargetAndReadback(D3DDevice, RtvHeap, 100, 100, &RenderTarget, + &RenderTargetRead); + + D3D12_RESOURCE_DESC RtDesc = RenderTarget->GetDesc(); + D3D12_VIEWPORT Viewport; + D3D12_RECT ScissorRect; + + memset(&Viewport, 0, sizeof(Viewport)); + Viewport.Height = (float)RtDesc.Height; + Viewport.Width = (float)RtDesc.Width; + Viewport.MaxDepth = 1.0f; + memset(&ScissorRect, 0, sizeof(ScissorRect)); + ScissorRect.right = (long)RtDesc.Width; + ScissorRect.bottom = RtDesc.Height; + CommandList->SetGraphicsRootSignature(RootSignature); + CommandList->SetGraphicsRootDescriptorTable(0, ResHandle); + CommandList->SetGraphicsRootUnorderedAccessView( + 1, AtomicCounterResource->GetGPUVirtualAddress()); + CommandList->RSSetViewports(1, &Viewport); + CommandList->RSSetScissorRects(1, &ScissorRect); + + // Indicate that the buffer will be used as a render target. + RecordTransitionBarrier(CommandList, RenderTarget, + D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_RENDER_TARGET); + + CD3DX12_CPU_DESCRIPTOR_HANDLE RtvHandle( + RtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, RtvDescSize); + CommandList->OMSetRenderTargets(1, &RtvHandle, FALSE, nullptr); + + CommandList->ClearRenderTargetView(RtvHandle, ClearColor, 0, nullptr); + CommandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + CommandList->DrawInstanced(3, 1, 0, 0); + } + CommandList->Close(); ExecuteCommandList(CommandQueue, CommandList); WaitForSignal(CommandQueue, FO); From 07a32cbbd6091e36df2b472b4f585711c4e48f35 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 15:59:05 -0400 Subject: [PATCH 09/26] logging fix --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 191899457f..123cf2a8ef 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12350,10 +12350,11 @@ void ExecutionTest::runCoopVecMulSubtest( LogCommentFmt( L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: " - L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s", + L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s, Stage: %s", Config.InputPerThread, Config.OutputPerThread, Config.NumThreads, Config.NumLayers, Config.Bias ? L"true" : L"false", - CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str()); + CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(), + RunCompute ? L"Compute" : L"Pixel"); const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4); @@ -12828,8 +12829,7 @@ float4 ps_main() : SV_Target { fabs(Result - Expected) > 0.00001) { LogErrorFmt(L"Result mismatch at index %d", i * Config.OutputPerThread + j); - LogErrorFmt(L"Result: %f, Expected: %f (stage: %s)", Result, - Expected, RunCompute ? L"compute" : L"pixel"); + LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected); Equal = false; } } From a2731a3ab04229baca908a52e374f67b60c1b5d1 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 16:01:45 -0400 Subject: [PATCH 10/26] s/ComputePipelineState/PipelineState/ --- .../unittests/HLSLExec/ExecutionTest.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 123cf2a8ef..903365914f 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12423,7 +12423,7 @@ void ExecutionTest::runCoopVecMulSubtest( } // Create the compute pipeline state for the CoopVec shader - CComPtr ComputePipelineState; + CComPtr PipelineState; std::string ShaderSource = R"( #include "dx/linalg.h" @@ -12612,7 +12612,7 @@ float4 ps_main() : SV_Target { if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", - &ComputePipelineState, Options, _countof(Options), + &PipelineState, Options, _countof(Options), IncludeHandler); } else { CComPtr VertexShader; @@ -12638,7 +12638,7 @@ float4 ps_main() : SV_Target { PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; PsoDesc.SampleDesc.Count = 1; VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState( - &PsoDesc, IID_PPV_ARGS(&ComputePipelineState))); + &PsoDesc, IID_PPV_ARGS(&PipelineState))); } // Create a command list for the compute shader. @@ -12652,7 +12652,7 @@ float4 ps_main() : SV_Target { VERIFY_SUCCEEDED(D3DDevice->CreateCommandAllocator( D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&CommandAllocator))); VERIFY_SUCCEEDED(D3DDevice->CreateCommandList( - 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, + 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, PipelineState, IID_PPV_ARGS(&CommandList))); std::vector> InputMatrixSRVResources( @@ -12752,7 +12752,7 @@ float4 ps_main() : SV_Target { ExecuteCommandList(CommandQueue, CommandList); WaitForSignal(CommandQueue, FO); VERIFY_SUCCEEDED(CommandAllocator->Reset()); - VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, ComputePipelineState)); + VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, PipelineState)); SetDescriptorHeap(CommandList, DescriptorHeap); @@ -12766,7 +12766,7 @@ float4 ps_main() : SV_Target { if (RunCompute) { CommandList->SetComputeRootSignature(RootSignature); CommandList->SetComputeRootDescriptorTable(0, ResHandle); - CommandList->SetPipelineState(ComputePipelineState); + CommandList->SetPipelineState(PipelineState); CommandList->Dispatch(1, 1, 1); } else { UINT FrameCount = 1; @@ -13038,7 +13038,7 @@ void ExecutionTest::runCoopVecOuterProductSubtest( } // Create a compute pipeline state object. - CComPtr ComputePipelineState; + CComPtr PipelineState; std::string ShaderSource = R"( #include "dx/linalg.h" @@ -13173,7 +13173,7 @@ float4 ps_main() : SV_Target { if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", - &ComputePipelineState, Options, _countof(Options), + &PipelineState, Options, _countof(Options), IncludeHandler); } else { CComPtr VertexShader; @@ -13199,7 +13199,7 @@ float4 ps_main() : SV_Target { PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM; PsoDesc.SampleDesc.Count = 1; VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState( - &PsoDesc, IID_PPV_ARGS(&ComputePipelineState))); + &PsoDesc, IID_PPV_ARGS(&PipelineState))); } // Create a command list for the compute shader. @@ -13213,7 +13213,7 @@ float4 ps_main() : SV_Target { VERIFY_SUCCEEDED(D3DDevice->CreateCommandAllocator( D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&CommandAllocator))); VERIFY_SUCCEEDED(D3DDevice->CreateCommandList( - 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState, + 0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, PipelineState, IID_PPV_ARGS(&CommandList))); CComPtr InputMatrixSRVResource, InputMatrixSRVUploadResource; @@ -13354,7 +13354,7 @@ float4 ps_main() : SV_Target { ExecuteCommandList(CommandQueue, CommandList); WaitForSignal(CommandQueue, FO); VERIFY_SUCCEEDED(CommandAllocator->Reset()); - VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, ComputePipelineState)); + VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, PipelineState)); SetDescriptorHeap(CommandList, DescriptorHeap); @@ -13368,7 +13368,7 @@ float4 ps_main() : SV_Target { if (RunCompute) { CommandList->SetComputeRootSignature(RootSignature); CommandList->SetComputeRootDescriptorTable(0, ResHandle); - CommandList->SetPipelineState(ComputePipelineState); + CommandList->SetPipelineState(PipelineState); CommandList->Dispatch(1, 1, 1); } else { UINT FrameCount = 1; @@ -13414,7 +13414,7 @@ float4 ps_main() : SV_Target { WaitForSignal(CommandQueue, FO); VERIFY_SUCCEEDED(CommandAllocator->Reset()); - VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, ComputePipelineState)); + VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, PipelineState)); // Convert matrix to sint8/fp32 row-major format before reading back to the // CPU. A new resource is created, along with a readback resource, for the From f5bfc88d5414db30084a22dcc3f42f1009a9e6fc Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 16:05:35 -0400 Subject: [PATCH 11/26] Add some more sizes to OuterProduct test --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 903365914f..19f7f660a5 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12899,7 +12899,14 @@ void ExecutionTest::runCoopVecOuterProductTestConfig( .c_str()); constexpr CoopVecOuterProductSubtestConfig TestConfigs[] = { - {4, 4, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {4, 4, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {4, 4, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {16, 16, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {16, 16, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {32, 32, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {32, 32, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {64, 64, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, + {64, 64, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL}, }; for (auto Config : TestConfigs) { From ad20ee604943bc45bd8d0e024559864264107916 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Wed, 7 May 2025 16:16:27 -0400 Subject: [PATCH 12/26] pixel shader bounds checks --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 19f7f660a5..bab6ab917b 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12509,6 +12509,13 @@ float4 vs_main(uint vid : SV_VertexID) : SV_Position { float4 ps_main() : SV_Target { uint threadIdx; InterlockedAdd(AtomicCounter[0], 1, threadIdx); + // threadIdx may exceed NUM_THREADS, but bounds checking on the vector + // loads/stores will prevent any faults from occurring. This lets us + // exercise the CoopVec implementation on more threads, giving us + // further confidence that there are no bad interactions between "good" + // threads and threads that fail bounds checking and operate on all-zero + // input data. This also gives us some additional testing of long vector + // bounds-checking. RunCoopVecTest(threadIdx); return float4(1, 1, 1, 1); } @@ -13094,7 +13101,8 @@ float4 vs_main(uint vid : SV_VertexID) : SV_Position { float4 ps_main() : SV_Target { uint threadIdx; InterlockedAdd(AtomicCounter[0], 1, threadIdx); - RunCoopVecTest(threadIdx); + if (threadIdx < NUM_THREADS) + RunCoopVecTest(threadIdx); return float4(1, 1, 1, 1); } )"; From e70a45fca04bfc06d05506315633601ffaea635c Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Thu, 8 May 2025 19:38:44 -0400 Subject: [PATCH 13/26] Implement loading/storing input/output vectors through groupshared memory and improved input vector/matrix test patterns --- tools/clang/unittests/HLSLExec/CoopVec.h | 90 ++++++++---- .../unittests/HLSLExec/ExecutionTest.cpp | 132 +++++++++++------- 2 files changed, 146 insertions(+), 76 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index 18b8669197..bbcc5b8b96 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -6,6 +6,7 @@ #include #include +#include #include #include "dxc/Support/microcom.h" @@ -358,6 +359,15 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) { } } +bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { + return DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 || + DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16 || + DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16 || + DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 || + DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32; +} + struct TestVector { private: size_t NumVectors = 0; @@ -534,31 +544,51 @@ struct TestVector { } } - template void fillSimpleTestData() { - // Create a vector of (1, 1, 0, ...) + template + void fillSimpleTestData(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, + std::mt19937 &Rnd) { for (size_t I = 0; I < NumVectors; ++I) { T *Vec = getVector(I); for (size_t J = 0; J < VectorSize; ++J) - if constexpr (std::is_same_v) { - // Special case for HALF, which requires conversion from float - Vec[J] = static_cast( - ConvertFloat32ToFloat16((J == 0 || J == 1) ? 1.0f : 0.0f)); + if constexpr (std::is_same_v || + std::is_same_v) { + float Elt = 0.0f; + if (IsIntegralDataType(MatrixInterpretation)) { + Elt = (float)(Rnd() & 0x7) - 3.0f; + } else { + Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; + } + if constexpr (std::is_same_v) { + Vec[J] = static_cast(ConvertFloat32ToFloat16(Elt)); + } else { + Vec[J] = static_cast(Elt); + } } else { - Vec[J] = static_cast((J == 0 || J == 1) ? 1 : 0); + if constexpr (std::is_signed_v) { + Vec[J] = static_cast((int32_t)(Rnd() & 0xf) - 8); + } else { + Vec[J] = static_cast((uint32_t)(Rnd() & 0xf)); + } } } } - template void fillAllOnesTestData() { - // Create a vector of (1, 1, 1, ...) + template void FillSimpleMatrixTestData(std::mt19937 &Rnd) { for (size_t I = 0; I < NumVectors; ++I) { T *Vec = getVector(I); for (size_t J = 0; J < VectorSize; ++J) if constexpr (std::is_same_v) { - // Special case for HALF, which requires conversion from float - Vec[J] = static_cast(ConvertFloat32ToFloat16(1.0f)); + float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; + Vec[J] = static_cast(ConvertFloat32ToFloat16(Elt)); + } else if constexpr (std::is_same_v) { + float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; + Vec[J] = static_cast(Elt); } else { - Vec[J] = static_cast(1); + if constexpr (std::is_signed_v) { + Vec[J] = static_cast((int32_t)(Rnd() & 0xf) - 8); + } else { + Vec[J] = static_cast((uint32_t)(Rnd() & 0xf)); + } } } } @@ -566,7 +596,9 @@ struct TestVector { static TestVector createSimpleTestVector(size_t NumVectors, size_t VectorSize, D3D12_LINEAR_ALGEBRA_DATATYPE DataType, - D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation, + D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, + std::mt19937 &Rnd) { size_t ElementSize; switch (DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: @@ -600,35 +632,36 @@ struct TestVector { TestVector Vec(NumVectors, VectorSize, ElementSize); switch (DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); } else { - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); } break; case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, + Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: - Vec.fillSimpleTestData(); + Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; default: throw std::invalid_argument("Unsupported data type"); @@ -638,7 +671,8 @@ struct TestVector { static TestVector createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize, - D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation, + std::mt19937 &Rnd) { size_t ElementSize; switch (DataInterpretation) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: @@ -666,13 +700,13 @@ struct TestVector { case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: - Vec.fillAllOnesTestData(); + Vec.FillSimpleMatrixTestData(Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: - Vec.fillAllOnesTestData(); + Vec.FillSimpleMatrixTestData(Rnd); break; default: throw std::invalid_argument("Unsupported data type"); @@ -724,10 +758,12 @@ struct TestVector { ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize(); if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { - ConvertInfo.DestInfo.DestStride = (UINT)getVectorSize() * DestEltSize; + ConvertInfo.DestInfo.DestStride = + ((UINT)getVectorSize() * DestEltSize + 15) & ~15; } else if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { - ConvertInfo.DestInfo.DestStride = (UINT)getNumVectors() * DestEltSize; + ConvertInfo.DestInfo.DestStride = + ((UINT)getNumVectors() * DestEltSize + 15) & ~15; } // Get destination size using preview interface diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index bab6ab917b..a58137a63f 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12348,6 +12348,8 @@ void ExecutionTest::runCoopVecMulSubtest( ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps, CoopVecMulSubtestConfig &Config, bool RunCompute) { + std::mt19937 Rnd(0x42); + LogCommentFmt( L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: " L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s, Stage: %s", @@ -12399,19 +12401,20 @@ void ExecutionTest::runCoopVecMulSubtest( InputMatrices.push_back( ::CoopVecHelpers::TestVector::createAllOnesTestMatrix( Config.InputPerThread, Config.InputPerThread, - MulProps.MatrixInterpretation)); + MulProps.MatrixInterpretation, Rnd)); } // Last layer, matrix size is OutputPerThread x InputPerThread InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix( Config.OutputPerThread, Config.InputPerThread, - MulProps.MatrixInterpretation)); + MulProps.MatrixInterpretation, Rnd)); auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector( Config.NumThreads, Config.InputPerThread, MulProps.InputType, - MulProps.InputInterpretation); + MulProps.InputInterpretation, MulProps.MatrixInterpretation, Rnd); auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector( 1, std::max(Config.OutputPerThread, Config.InputPerThread), - MulProps.BiasInterpretation, MulProps.BiasInterpretation); + MulProps.BiasInterpretation, MulProps.BiasInterpretation, + MulProps.MatrixInterpretation, Rnd); // Calculate reference output auto ExpectedOutput = InputVector; @@ -12435,20 +12438,32 @@ RWByteAddressBuffer OutputBuffer: register(u0); RWStructuredBuffer AtomicCounter : register(u10); +#if USE_GROUPSHARED +groupshared vector inputGS[NUM_THREADS]; +groupshared vector outputGS[NUM_THREADS]; +#endif + void RunCoopVecTest(uint threadIdx) { using namespace dx::linalg; uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE); vector input = InputVector.Load >(inputOffset); - VectorRef biasVec = { InputBias, 0 }; +#if USE_GROUPSHARED + // Use groupshared memory to grab the "next" thread's input vector. + inputGS[threadIdx] = input; + GroupMemoryBarrierWithGroupSync(); + input = inputGS[(threadIdx + 1) % NUM_THREADS]; +#endif + + VectorRef biasVec = { InputBias, 0 }; vector output; )"; if (Config.NumLayers == 1) { ShaderSource += R"( - MatrixRef mat = { InputMatrix[0], 0, STRIDE }; + MatrixRef mat = { InputMatrix[0], 0, STRIDE0 }; if (USE_BIAS) { output = MulAdd(mat, MakeInterpretedVector(input), biasVec); @@ -12460,17 +12475,17 @@ void RunCoopVecTest(uint threadIdx) ShaderSource += R"( vector accum; - MatrixRef mat0 = { InputMatrix[0], 0, STRIDE }; + MatrixRef mat0 = { InputMatrix[0], 0, STRIDE0 }; if (USE_BIAS) { accum = MulAdd(mat0, MakeInterpretedVector(input), biasVec); } else { accum = Mul(mat0, MakeInterpretedVector(input)); } - // Dummy activation function; all of our intermediates are positive (currently). - accum = max(accum, 0); + // Dummy activation function; all of our intermediates above -10000 + accum = max(accum, -10000); - MatrixRef mat1 = { InputMatrix[1], 0, STRIDE }; + MatrixRef mat1 = { InputMatrix[1], 0, STRIDE1 }; if (USE_BIAS) { output = MulAdd(mat1, MakeInterpretedVector(accum), biasVec); } else { @@ -12482,6 +12497,13 @@ void RunCoopVecTest(uint threadIdx) ShaderSource += R"( vector result = (vector)output; +#if USE_GROUPSHARED + // Use groupshared memory to grab the "previous" thread's output vector. + outputGS[threadIdx] = result; + GroupMemoryBarrierWithGroupSync(); + result = outputGS[(threadIdx + NUM_THREADS - 1) % NUM_THREADS]; +#endif + // Ensure 4-byte alignment for vector store uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float); OutputBuffer.Store >(outputOffset, result); @@ -12534,20 +12556,8 @@ float4 ps_main() : SV_Target { return Stream.str(); }; - int Stride = 0; const std::wstring HlslMatrixLayout = CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); - int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType( - MulProps.MatrixInterpretation); - switch (Config.MatrixLayout) { - case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR: - Stride = Config.InputPerThread * StrideMultiplier; - break; - case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR: - Stride = Config.OutputPerThread * StrideMultiplier; - break; - } - const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType( MulProps.InputInterpretation); const std::wstring InputDataType = @@ -12570,7 +12580,6 @@ float4 ps_main() : SV_Target { CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread); auto NumThreadsDefine = CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); - auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); auto InputDataTypeDefine = CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType); auto InputDivisorDefine = CreateDefineFromInt( @@ -12594,13 +12603,14 @@ float4 ps_main() : SV_Target { auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers); auto BiasInterpretationEnumDefine = CreateDefineFromString( L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum); + auto UseGroupsharedDefine = + CreateDefineFromInt(L"USE_GROUPSHARED", RunCompute ? 1 : 0); - LPCWSTR Options[] = { + std::vector Options = { L"-enable-16bit-types", InputPerThreadDefine.c_str(), OutputPerThreadDefine.c_str(), NumThreadsDefine.c_str(), - StrideDefine.c_str(), InputDataTypeDefine.c_str(), InputDivisorDefine.c_str(), AccumDataTypeDefine.c_str(), @@ -12612,23 +12622,35 @@ float4 ps_main() : SV_Target { InputVectorStrideDefine.c_str(), NumLayersDefine.c_str(), BiasInterpretationEnumDefine.c_str(), + UseGroupsharedDefine.c_str(), }; + std::vector StrideDefines; + for (int I = 0; I < Config.NumLayers; ++I) { + auto ConvertInfo = InputMatrices[I].getConversionInfo( + D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout); + wchar_t StrideName[16]; + swprintf(StrideName, _countof(StrideName), L"STRIDE%d", I); + StrideDefines.push_back( + CreateDefineFromInt(StrideName, ConvertInfo.DestInfo.DestStride)); + Options.push_back(StrideDefines[I].c_str()); + } + CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", - &PipelineState, Options, _countof(Options), + &PipelineState, Options.data(), (int)Options.size(), IncludeHandler); } else { CComPtr VertexShader; CComPtr PixelShader; CompileFromText(ShaderSource.c_str(), L"vs_main", L"vs_6_9", &VertexShader, - Options, _countof(Options), IncludeHandler); + Options.data(), (int)Options.size(), IncludeHandler); CompileFromText(ShaderSource.c_str(), L"ps_main", L"ps_6_9", &PixelShader, - Options, _countof(Options), IncludeHandler); + Options.data(), (int)Options.size(), IncludeHandler); D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {}; // psoDesc.InputLayout; @@ -12828,16 +12850,30 @@ float4 ps_main() : SV_Target { float *ResultBuffer = (float *)MappedData.data(); bool Equal = true; - for (int i = 0; i < Config.NumThreads; ++i) { + float MaxError = 0.00001f; + if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { + // Allow for more error in fp16 relative to the fp32 reference + MaxError = 0.1f; + } else if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3) { + // And even more error for the fp8 formats + MaxError = 1.0f; + } else if (MulProps.MatrixInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { + MaxError = 3.0f; + } + + for (int i = 0; i < Config.NumThreads && Equal; ++i) { for (int j = 0; j < Config.OutputPerThread; ++j) { float Result = ResultBuffer[i * Config.OutputPerThread + j]; float Expected = ExpectedOutput.getVector(i)[j]; if (isnan(Result) || isnan(Expected) || - fabs(Result - Expected) > 0.00001) { - LogErrorFmt(L"Result mismatch at index %d", - i * Config.OutputPerThread + j); + fabs(Result - Expected) > MaxError) { + LogErrorFmt(L"Result mismatch at vector %d, element %d", i, j); LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected); Equal = false; + break; } } } @@ -12938,6 +12974,8 @@ void ExecutionTest::runCoopVecOuterProductSubtest( D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps, CoopVecOuterProductSubtestConfig &Config, bool RunCompute) { + std::mt19937 Rnd(0x42); + LogCommentFmt( L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s, " L"Stage: %s", @@ -12996,17 +13034,17 @@ void ExecutionTest::runCoopVecOuterProductSubtest( InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix(Config.DimN, Config.DimM); } else { - WEX::Logging::Log::Error(L"Unsupported matrix data type"); + WEX::Logging::Log::Comment(L"Unsupported matrix data type"); return; } // Create input vectors auto InputVector1 = CoopVecHelpers::TestVector::createSimpleTestVector( Config.NumThreads, Config.DimM, AccumulateProps.InputType, - AccumulateProps.InputType); + AccumulateProps.InputType, AccumulateProps.AccumulationType, Rnd); auto InputVector2 = CoopVecHelpers::TestVector::createSimpleTestVector( Config.NumThreads, Config.DimN, AccumulateProps.InputType, - AccumulateProps.InputType); + AccumulateProps.InputType, AccumulateProps.AccumulationType, Rnd); // Calculate reference output auto ExpectedOutputBufferI8 = @@ -13017,14 +13055,11 @@ void ExecutionTest::runCoopVecOuterProductSubtest( ExpectedOutputBufferI8.size()); if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { - DirectX::PackedVector::HALF *InputVector1FP16 = - reinterpret_cast( - InputVector1.getBuffer()); - DirectX::PackedVector::HALF *InputVector2FP16 = - reinterpret_cast( - InputVector2.getBuffer()); - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + auto *InputVector1FP16 = + InputVector1.getVector(ThreadIdx); + auto *InputVector2FP16 = + InputVector2.getVector(ThreadIdx); for (int M = 0; M < Config.DimM; ++M) { for (int N = 0; N < Config.DimN; ++N) { float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) * @@ -13035,20 +13070,19 @@ void ExecutionTest::runCoopVecOuterProductSubtest( } } else if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - float *InputVector1FP32 = - reinterpret_cast(InputVector1.getBuffer()); - float *InputVector2FP32 = - reinterpret_cast(InputVector2.getBuffer()); - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + auto *InputVector1FP32 = InputVector1.getVector(ThreadIdx); + auto *InputVector2FP32 = InputVector2.getVector(ThreadIdx); for (int M = 0; M < Config.DimM; ++M) { for (int N = 0; N < Config.DimN; ++N) { - float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] * - InputVector2FP32[ThreadIdx * Config.DimN + N]; + float Acc = InputVector1FP32[M] * InputVector2FP32[N]; ExpectedOutputBuffer[M * Config.DimN + N] += Acc; } } } + } else { + WEX::Logging::Log::Comment(L"Unsupported input data type"); + return; } // Create a compute pipeline state object. From 00b0385458ad86a5d2cea16f601cf62074df83a1 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 10:23:17 -0400 Subject: [PATCH 14/26] Update CoopVecAPI.h with latest Agility SDK preview d3d12.h --- tools/clang/unittests/HLSLExec/CoopVecAPI.h | 14 ++++++-------- .../clang/unittests/HLSLExec/ExecutionTest.cpp | 18 +++++++++--------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVecAPI.h b/tools/clang/unittests/HLSLExec/CoopVecAPI.h index 16c1105edc..563366e0bc 100644 --- a/tools/clang/unittests/HLSLExec/CoopVecAPI.h +++ b/tools/clang/unittests/HLSLExec/CoopVecAPI.h @@ -145,18 +145,16 @@ ID3D12DevicePreview : public IUnknown #endif /* __ID3D12DevicePreview_INTERFACE_DEFINED__ */ -#ifndef __ID3D12GraphicsCommandList11_INTERFACE_DEFINED__ -#define __ID3D12GraphicsCommandList11_INTERFACE_DEFINED__ +#ifndef __ID3D12GraphicsCommandListPreview_INTERFACE_DEFINED__ +#define __ID3D12GraphicsCommandListPreview_INTERFACE_DEFINED__ -EXTERN_C const IID IID_ID3D12GraphicsCommandList11; +EXTERN_C const IID IID_ID3D12GraphicsCommandListPreview; -MIDL_INTERFACE("f0dcfabc-a84a-4fe3-b3b9-eab26b306c38") -ID3D12GraphicsCommandList11 : public ID3D12GraphicsCommandList10 +MIDL_INTERFACE("536d9bb6-9eee-4c75-86e8-e29e29e08ed3") +ID3D12GraphicsCommandListPreview : public ID3D12GraphicsCommandList10 { public: virtual void STDMETHODCALLTYPE Reserved0() = 0; - virtual void STDMETHODCALLTYPE Reserved1() = 0; - virtual void STDMETHODCALLTYPE Reserved2() = 0; virtual void STDMETHODCALLTYPE ConvertLinearAlgebraMatrix( _In_ const D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO *pDesc, @@ -164,7 +162,7 @@ ID3D12GraphicsCommandList11 : public ID3D12GraphicsCommandList10 }; -#endif /* __ID3D12GraphicsCommandList11_INTERFACE_DEFINED__ */ +#endif /* __ID3D12GraphicsCommandListPreview_INTERFACE_DEFINED__ */ #else // __ID3D12GraphicsCommandList10_INTERFACE_DEFINED__ // The used d3d12.h header does not support ID3D12GraphicsCommandList10, diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index a58137a63f..f369d38ae8 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12745,10 +12745,10 @@ float4 ps_main() : SV_Target { InputMatrixSRVResources[I]->GetGPUVirtualAddress(); // Get command list interface and perform conversion - CComPtr CommandList11; + CComPtr CommandListPreview; VERIFY_SUCCEEDED(CommandList->QueryInterface( - __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); - CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); + __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview)); + CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); // This increments BaseHandle CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t), @@ -13376,10 +13376,10 @@ float4 ps_main() : SV_Target { ConvertInfo.DataDesc = DataDesc; // Get command list interface and perform conversion - CComPtr CommandList11; + CComPtr CommandListPreview; VERIFY_SUCCEEDED(CommandList->QueryInterface( - __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); - CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); + __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview)); + CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); // This increments baseHandle if ((ConvertInfo.DestInfo.DestSize % 4) != 0) { @@ -13527,10 +13527,10 @@ float4 ps_main() : SV_Target { ConvertedMatrixResource->GetGPUVirtualAddress(); // Get command list interface and perform conversion - CComPtr CommandList11; + CComPtr CommandListPreview; VERIFY_SUCCEEDED(CommandList->QueryInterface( - __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11)); - CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); + __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview)); + CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); } RecordTransitionBarrier(CommandList, MatrixRowMajorResource, From e21d92d82569697b7afd1d38560c20a56a7f236d Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 12:04:50 -0400 Subject: [PATCH 15/26] clang-format --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index f369d38ae8..a06de31508 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12746,8 +12746,9 @@ float4 ps_main() : SV_Target { // Get command list interface and perform conversion CComPtr CommandListPreview; - VERIFY_SUCCEEDED(CommandList->QueryInterface( - __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview)); + VERIFY_SUCCEEDED( + CommandList->QueryInterface(__uuidof(ID3D12GraphicsCommandListPreview), + (void **)&CommandListPreview)); CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); // This increments BaseHandle @@ -13377,8 +13378,9 @@ float4 ps_main() : SV_Target { // Get command list interface and perform conversion CComPtr CommandListPreview; - VERIFY_SUCCEEDED(CommandList->QueryInterface( - __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview)); + VERIFY_SUCCEEDED( + CommandList->QueryInterface(__uuidof(ID3D12GraphicsCommandListPreview), + (void **)&CommandListPreview)); CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); // This increments baseHandle @@ -13528,8 +13530,9 @@ float4 ps_main() : SV_Target { // Get command list interface and perform conversion CComPtr CommandListPreview; - VERIFY_SUCCEEDED(CommandList->QueryInterface( - __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview)); + VERIFY_SUCCEEDED( + CommandList->QueryInterface(__uuidof(ID3D12GraphicsCommandListPreview), + (void **)&CommandListPreview)); CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1); } From 20b6ad76e6ced4ed7d7808b870022962fccc14f0 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 14:53:38 -0400 Subject: [PATCH 16/26] Fix w-pos in vertex shaders --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index a06de31508..18e27f849d 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12519,11 +12519,11 @@ void main(uint threadIdx : SV_GroupThreadID) float4 vs_main(uint vid : SV_VertexID) : SV_Position { switch (vid) { case 0: - return float4(-1, 1, 0, 0); + return float4(-1, 1, 0, 1); case 1: - return float4(3, 1, 0, 0); + return float4(3, 1, 0, 1); case 2: - return float4(-1, -3, 0, 0); + return float4(-1, -3, 0, 1); } return float4(0, 0, 0, 0); } @@ -13124,11 +13124,11 @@ void main(uint threadIdx : SV_GroupThreadID) float4 vs_main(uint vid : SV_VertexID) : SV_Position { switch (vid) { case 0: - return float4(-1, 1, 0, 0); + return float4(-1, 1, 0, 1); case 1: - return float4(3, 1, 0, 0); + return float4(3, 1, 0, 1); case 2: - return float4(-1, -3, 0, 0); + return float4(-1, -3, 0, 1); } return float4(0, 0, 0, 0); } From 6f8a4e21fa195753f082ba3d62ffd29c0484b0bf Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 14:57:36 -0400 Subject: [PATCH 17/26] Re-enable debug layer --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 18e27f849d..27159375ff 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -758,7 +758,7 @@ class ExecutionTest { #endif } - bool UseDebugIfaces() { return false; } + bool UseDebugIfaces() { return true; } bool SaveImages() { return GetTestParamBool(L"SaveImages"); } From 9d171ed20354ff261af69f7b33b95cd53a98e9d6 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 15:15:56 -0400 Subject: [PATCH 18/26] style nits --- tools/clang/unittests/HLSLExec/CoopVec.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index bbcc5b8b96..a354c0b4cb 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -34,9 +34,10 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { L"LinAlgHeader", ParamValue))) { return E_FAIL; } - if (ParamValue.IsEmpty()) { + + if (ParamValue.IsEmpty()) return E_FAIL; - } + LPCWSTR RealHeaderPath = reinterpret_cast(ParamValue.GetBuffer()); @@ -382,15 +383,12 @@ struct TestVector { size_t Alignment = 16) : NumVectors(NumVectors), VectorSize(VectorSize), ElementSize(ElementSize) { - if (NumVectors == 0) { + if (NumVectors == 0) throw std::invalid_argument("NumVectors must be greater than 0"); - } - if (VectorSize == 0) { + if (VectorSize == 0) throw std::invalid_argument("VectorSize must be greater than 0"); - } - if (ElementSize == 0) { + if (ElementSize == 0) throw std::invalid_argument("ElementSize must be greater than 0"); - } size_t VectorBytes = VectorSize * ElementSize; Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment; @@ -488,9 +486,8 @@ struct TestVector { Buffer = reinterpret_cast(Ptr); // Copy data - if (other.Buffer) { + if (other.Buffer) std::memcpy(Buffer, other.Buffer, TotalBytes); - } } return *this; } From b7bc46b5ea518974a8ea356853ca946575f8ae16 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 15:51:56 -0400 Subject: [PATCH 19/26] Fix up some uses of int and uint32_t that should be size_t --- tools/clang/unittests/HLSLExec/CoopVec.h | 32 ++-- .../unittests/HLSLExec/ExecutionTest.cpp | 142 +++++++++--------- 2 files changed, 90 insertions(+), 84 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index a354c0b4cb..689a4f214f 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -67,10 +67,10 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { namespace CoopVecHelpers { template -static std::vector CreateAllOnesInputMatrix(uint32_t Width, - uint32_t Height) { +static std::vector CreateAllOnesInputMatrix(size_t Width, + size_t Height) { std::vector InputMatrix(Width * Height); - for (uint32_t i = 0; i < Width * Height; i++) { + for (size_t i = 0; i < Width * Height; i++) { if constexpr (std::is_same_v || std::is_same_v) { InputMatrix[i] = 1; @@ -92,15 +92,15 @@ static std::vector CreateAllOnesInputMatrix(uint32_t Width, } template -static std::vector CreateInputVector(uint32_t NumThreads, - uint32_t EltsPerThread) { +static std::vector CreateInputVector(size_t NumThreads, + size_t EltsPerThread) { std::vector InputVector(NumThreads * EltsPerThread); std::fill(InputVector.begin(), InputVector.end(), EltTy(0)); if (EltsPerThread < 2) { WEX::Logging::Log::Error(L"EltsPerThread must be at least 2"); return std::vector(); } - for (uint32_t TID = 0; TID < NumThreads; TID++) { + for (size_t TID = 0; TID < NumThreads; TID++) { if constexpr (std::is_same_v || std::is_same_v) { InputVector[TID * EltsPerThread + 0] = 1; @@ -125,7 +125,7 @@ static std::vector CreateInputVector(uint32_t NumThreads, } template -static std::vector CreateInputBias(uint32_t NumElts) { +static std::vector CreateInputBias(size_t NumElts) { std::vector InputBias(NumElts); if constexpr (std::is_same_v || std::is_same_v) { @@ -248,7 +248,7 @@ static std::wstring MatrixLayoutToHlslLayoutString( // This multiplier is used to compute the row/column stride for a matrix // given it's element size. -static int +static size_t GetStrideMultiplierForMatrixDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { switch (DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: @@ -271,7 +271,7 @@ GetStrideMultiplierForMatrixDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { } } -static int GetNumPackedElementsForInputDataType( +static size_t GetNumPackedElementsForInputDataType( D3D12_LINEAR_ALGEBRA_DATATYPE InputInterpretation) { // Int8 packed types are the only ones that have more than 1 element per // shader variable @@ -724,7 +724,7 @@ struct TestVector { // Create destination matrix info ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver - int DestEltSize = 0; + UINT DestEltSize = 0; switch (DestDataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: @@ -798,14 +798,14 @@ struct TestVector { sizeof(float)); if (IsMatrixFP32) { - for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { + for (size_t VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { const DirectX::PackedVector::HALF *InputBiasFP16 = Bias.getVector(0); - for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); + for (size_t OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); ++OutputIdx) { float Acc = 0; - for (int InputIdx = 0; InputIdx < Matrix.getVectorSize(); + for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize(); ++InputIdx) { float InputElem; if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { @@ -829,13 +829,13 @@ struct TestVector { } } } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { - for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { + for (size_t VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) { const int32_t *InputBiasI32 = Bias.getVector(0); - for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); + for (size_t OutputIdx = 0; OutputIdx < Matrix.getNumVectors(); ++OutputIdx) { int Acc = 0; - for (int InputIdx = 0; InputIdx < Matrix.getVectorSize(); + for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize(); ++InputIdx) { int InputElem; if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index 27159375ff..afdde90029 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -786,10 +786,10 @@ class ExecutionTest { #if HAVE_COOPVEC_API struct CoopVecMulSubtestConfig { - int InputPerThread; - int OutputPerThread; - int NumThreads; - int NumLayers; + size_t InputPerThread; + size_t OutputPerThread; + size_t NumThreads; + size_t NumLayers; D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout; bool Bias; }; @@ -802,9 +802,9 @@ class ExecutionTest { CoopVecMulSubtestConfig &Config, bool RunCompute); struct CoopVecOuterProductSubtestConfig { - int DimM; // Row Count - int DimN; // Column Count - int NumThreads; + size_t DimM; // Row Count + size_t DimN; // Column Count + size_t NumThreads; D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout; }; @@ -12358,13 +12358,15 @@ void ExecutionTest::runCoopVecMulSubtest( CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(), RunCompute ? L"Compute" : L"Pixel"); - const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4); + const size_t OutputBufferSize = + (Config.OutputPerThread * Config.NumThreads * 4); // Create root signature with a single root entry for all SRVs and UAVs CComPtr RootSignature; { CD3DX12_DESCRIPTOR_RANGE Ranges[2]; - Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0, + Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + (UINT)Config.NumLayers, + 0, 0); // InputVector, InputBias, InputMatrices[] Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer @@ -12385,7 +12387,7 @@ void ExecutionTest::runCoopVecMulSubtest( { D3D12_DESCRIPTOR_HEAP_DESC Desc = {}; Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - Desc.NumDescriptors = 3 + Config.NumLayers; + Desc.NumDescriptors = 3 + (UINT)Config.NumLayers; Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; VERIFY_SUCCEEDED( D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap))); @@ -12396,7 +12398,7 @@ void ExecutionTest::runCoopVecMulSubtest( // Our input matrix is really a set of row vectors, which we can represent // as a TestVector. std::vector<::CoopVecHelpers::TestVector> InputMatrices; - for (int I = 0; I < Config.NumLayers - 1; ++I) { + for (size_t I = 0; I < Config.NumLayers - 1; ++I) { // Each layer except the last is InputPerThread x InputPerThread InputMatrices.push_back( ::CoopVecHelpers::TestVector::createAllOnesTestMatrix( @@ -12418,7 +12420,7 @@ void ExecutionTest::runCoopVecMulSubtest( // Calculate reference output auto ExpectedOutput = InputVector; - for (int I = 0; I < Config.NumLayers; ++I) { + for (size_t I = 0; I < Config.NumLayers; ++I) { ExpectedOutput = ::CoopVecHelpers::TestVector::matrixVectorMultiply( InputMatrices[I], ExpectedOutput, InputBias, Config.Bias, MulProps.MatrixInterpretation, @@ -12543,7 +12545,7 @@ float4 ps_main() : SV_Target { } )"; - auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { + auto CreateDefineFromSize = [](const wchar_t *Name, size_t Value) { std::wstringstream Stream; Stream << L"-D" << Name << L"=" << Value; return Stream.str(); @@ -12558,8 +12560,9 @@ float4 ps_main() : SV_Target { const std::wstring HlslMatrixLayout = CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); - const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType( - MulProps.InputInterpretation); + const size_t InputDivisor = + CoopVecHelpers::GetNumPackedElementsForInputDataType( + MulProps.InputInterpretation); const std::wstring InputDataType = CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.InputType); const std::wstring AccumDataType = @@ -12575,14 +12578,14 @@ float4 ps_main() : SV_Target { MulProps.BiasInterpretation); auto InputPerThreadDefine = - CreateDefineFromInt(L"INPUT_PER_THREAD", Config.InputPerThread); + CreateDefineFromSize(L"INPUT_PER_THREAD", Config.InputPerThread); auto OutputPerThreadDefine = - CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread); + CreateDefineFromSize(L"OUTPUT_PER_THREAD", Config.OutputPerThread); auto NumThreadsDefine = - CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); + CreateDefineFromSize(L"NUM_THREADS", Config.NumThreads); auto InputDataTypeDefine = CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType); - auto InputDivisorDefine = CreateDefineFromInt( + auto InputDivisorDefine = CreateDefineFromSize( L"INPUT_VECTOR_NUM_ELEMENTS", (Config.InputPerThread + InputDivisor - 1) / InputDivisor); auto AccumDataTypeDefine = @@ -12593,18 +12596,18 @@ float4 ps_main() : SV_Target { CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout); auto MatrixDataTypeEnumDefine = CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum); - auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0); + auto UseBiasDefine = CreateDefineFromSize(L"USE_BIAS", Config.Bias ? 1 : 0); // Treat the accumulator interpretation the same as the input interpretation // for the purposes of MakeInterpretedVector. auto AccumInterpretationEnumDefine = CreateDefineFromString( L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum); auto InputVectorStrideDefine = - CreateDefineFromInt(L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride()); - auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers); + CreateDefineFromSize(L"INPUT_VECTOR_STRIDE", InputVector.getStride()); + auto NumLayersDefine = CreateDefineFromSize(L"NUM_LAYERS", Config.NumLayers); auto BiasInterpretationEnumDefine = CreateDefineFromString( L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum); auto UseGroupsharedDefine = - CreateDefineFromInt(L"USE_GROUPSHARED", RunCompute ? 1 : 0); + CreateDefineFromSize(L"USE_GROUPSHARED", RunCompute ? 1 : 0); std::vector Options = { L"-enable-16bit-types", @@ -12626,13 +12629,13 @@ float4 ps_main() : SV_Target { }; std::vector StrideDefines; - for (int I = 0; I < Config.NumLayers; ++I) { + for (size_t I = 0; I < Config.NumLayers; ++I) { auto ConvertInfo = InputMatrices[I].getConversionInfo( D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout); wchar_t StrideName[16]; - swprintf(StrideName, _countof(StrideName), L"STRIDE%d", I); + swprintf(StrideName, _countof(StrideName), L"STRIDE%zu", I); StrideDefines.push_back( - CreateDefineFromInt(StrideName, ConvertInfo.DestInfo.DestStride)); + CreateDefineFromSize(StrideName, ConvertInfo.DestInfo.DestStride)); Options.push_back(StrideDefines[I].c_str()); } @@ -12688,7 +12691,7 @@ float4 ps_main() : SV_Target { Config.NumLayers); std::vector> InputMatrixSRVUploadResources( Config.NumLayers); - for (int I = 0; I < Config.NumLayers; ++I) { + for (size_t I = 0; I < Config.NumLayers; ++I) { CreateTestResources( D3DDevice, CommandList, InputMatrices[I].getBuffer(), InputMatrices[I].getTotalBytes(), @@ -12727,11 +12730,11 @@ float4 ps_main() : SV_Target { // Create converted matrix resource and SRV for each input matrix std::vector> ConvertedMatrixResources( Config.NumLayers); - for (int I = 0; I < Config.NumLayers; ++I) { + for (size_t I = 0; I < Config.NumLayers; ++I) { auto ConvertInfo = InputMatrices[I].getConversionInfo( D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout); - int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16; + UINT SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16; // Create resource to hold matrix copy CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize, @@ -12776,7 +12779,7 @@ float4 ps_main() : SV_Target { CreateTestUavs(D3DDevice, CommandList, OutputBufferInit.data(), OutputBufferSize, &UavResource, &UavUploadResource, &UavReadResource); - CreateRawUAV(D3DDevice, BaseHandle, OutputBufferSize / 4, UavResource); + CreateRawUAV(D3DDevice, BaseHandle, (UINT)OutputBufferSize / 4, UavResource); CommandList->Close(); ExecuteCommandList(CommandQueue, CommandList); @@ -12846,7 +12849,7 @@ float4 ps_main() : SV_Target { WaitForSignal(CommandQueue, FO); { - MappedData MappedData(UavReadResource, OutputBufferSize); + MappedData MappedData(UavReadResource, (UINT)OutputBufferSize); float *ResultBuffer = (float *)MappedData.data(); bool Equal = true; @@ -12865,8 +12868,8 @@ float4 ps_main() : SV_Target { MaxError = 3.0f; } - for (int i = 0; i < Config.NumThreads && Equal; ++i) { - for (int j = 0; j < Config.OutputPerThread; ++j) { + for (size_t i = 0; i < Config.NumThreads && Equal; ++i) { + for (size_t j = 0; j < Config.OutputPerThread; ++j) { float Result = ResultBuffer[i * Config.OutputPerThread + j]; float Expected = ExpectedOutput.getVector(i)[j]; if (isnan(Result) || isnan(Expected) || @@ -13056,13 +13059,13 @@ void ExecutionTest::runCoopVecOuterProductSubtest( ExpectedOutputBufferI8.size()); if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) { - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (size_t ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { auto *InputVector1FP16 = InputVector1.getVector(ThreadIdx); auto *InputVector2FP16 = InputVector2.getVector(ThreadIdx); - for (int M = 0; M < Config.DimM; ++M) { - for (int N = 0; N < Config.DimN; ++N) { + for (size_t M = 0; M < Config.DimM; ++M) { + for (size_t N = 0; N < Config.DimN; ++N) { float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) * ConvertFloat16ToFloat32(InputVector2FP16[N]); ExpectedOutputBuffer[M * Config.DimN + N] += acc; @@ -13071,11 +13074,11 @@ void ExecutionTest::runCoopVecOuterProductSubtest( } } else if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { + for (size_t ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) { auto *InputVector1FP32 = InputVector1.getVector(ThreadIdx); auto *InputVector2FP32 = InputVector2.getVector(ThreadIdx); - for (int M = 0; M < Config.DimM; ++M) { - for (int N = 0; N < Config.DimN; ++N) { + for (size_t M = 0; M < Config.DimM; ++M) { + for (size_t N = 0; N < Config.DimN; ++N) { float Acc = InputVector1FP32[M] * InputVector2FP32[N]; ExpectedOutputBuffer[M * Config.DimN + N] += Acc; } @@ -13142,7 +13145,7 @@ float4 ps_main() : SV_Target { } )"; - auto CreateDefineFromInt = [](const wchar_t *Name, int Value) { + auto CreateDefineFromSize = [](const wchar_t *Name, size_t Value) { std::wstringstream Stream; Stream << L"-D" << Name << L"=" << Value; return Stream.str(); @@ -13154,11 +13157,12 @@ float4 ps_main() : SV_Target { return Stream.str(); }; - int Stride = 0; + size_t Stride = 0; const std::wstring HlslMatrixLayout = CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout); - int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType( - AccumulateProps.AccumulationType); + size_t StrideMultiplier = + CoopVecHelpers::GetStrideMultiplierForMatrixDataType( + AccumulateProps.AccumulationType); switch (Config.MatrixLayout) { case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR: Stride = Config.DimN * StrideMultiplier; @@ -13168,8 +13172,9 @@ float4 ps_main() : SV_Target { break; } - const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType( - AccumulateProps.InputType); + const size_t InputDivisor = + CoopVecHelpers::GetNumPackedElementsForInputDataType( + AccumulateProps.InputType); const std::wstring InputDataType = CoopVecHelpers::GetHlslDataTypeForDataType(AccumulateProps.InputType); const std::wstring AccumDataType = CoopVecHelpers::GetHlslDataTypeForDataType( @@ -13181,14 +13186,15 @@ float4 ps_main() : SV_Target { CoopVecHelpers::GetHlslInterpretationForDataType( AccumulateProps.InputType); - auto DimMDefine = CreateDefineFromInt(L"DIM_M", Config.DimM); - auto DimNDefine = CreateDefineFromInt(L"DIM_N", Config.DimN); + auto DimMDefine = CreateDefineFromSize(L"DIM_M", Config.DimM); + auto DimNDefine = CreateDefineFromSize(L"DIM_N", Config.DimN); auto NumThreadsDefine = - CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads); - auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride); + CreateDefineFromSize(L"NUM_THREADS", Config.NumThreads); + auto StrideDefine = CreateDefineFromSize(L"STRIDE", Stride); auto InputDataTypeDefine = CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType.c_str()); - auto InputDivisorDefine = CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor); + auto InputDivisorDefine = + CreateDefineFromSize(L"INPUT_DIVISOR", InputDivisor); auto AccumDataTypeDefine = CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType.c_str()); auto InputInterpretationEnumDefine = CreateDefineFromString( @@ -13197,10 +13203,10 @@ float4 ps_main() : SV_Target { CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str()); auto MatrixDataTypeEnumDefine = CreateDefineFromString( L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str()); - auto InputVector1StrideDefine = CreateDefineFromInt( - L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride()); - auto InputVector2StrideDefine = CreateDefineFromInt( - L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride()); + auto InputVector1StrideDefine = + CreateDefineFromSize(L"INPUT_VECTOR_1_STRIDE", InputVector1.getStride()); + auto InputVector2StrideDefine = + CreateDefineFromSize(L"INPUT_VECTOR_2_STRIDE", InputVector2.getStride()); LPCWSTR Options[] = { L"-enable-16bit-types", @@ -13295,7 +13301,7 @@ float4 ps_main() : SV_Target { InputVecSRVResource2); CComPtr ConvertedMatrixResource, ConvertedMatrixReadResource; - int ConvertedMatrixSize = 0; + UINT ConvertedMatrixSize = 0; { // Create source matrix info D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_SRC_INFO SrcInfo = {}; @@ -13306,8 +13312,8 @@ float4 ps_main() : SV_Target { // Create destination matrix info D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO DestInfo = {}; DestInfo.DestSize = 0; // Will be populated by driver - int SrcEltSize = 0; - int DestEltSize = 0; + UINT SrcEltSize = 0; + UINT DestEltSize = 0; switch (AccumulateProps.AccumulationType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: @@ -13331,19 +13337,19 @@ float4 ps_main() : SV_Target { DestEltSize = 1; // FP8 break; } - SrcInfo.SrcStride = Config.DimM * SrcEltSize; - SrcInfo.SrcSize = Config.DimM * Config.DimN * SrcEltSize; + SrcInfo.SrcStride = (UINT)(Config.DimM * SrcEltSize); + SrcInfo.SrcSize = (UINT)(Config.DimM * Config.DimN * SrcEltSize); DestInfo.DestLayout = Config.MatrixLayout; DestInfo.DestStride = 0; - DestInfo.NumRows = Config.DimM; - DestInfo.NumColumns = Config.DimN; + DestInfo.NumRows = (UINT)Config.DimM; + DestInfo.NumColumns = (UINT)Config.DimN; if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { - DestInfo.DestStride = Config.DimM * DestEltSize; + DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize); } else if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { - DestInfo.DestStride = Config.DimM * DestEltSize; + DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize); } // Create conversion info @@ -13483,8 +13489,8 @@ float4 ps_main() : SV_Target { ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver ConvertInfo.DestInfo.DestLayout = D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; - ConvertInfo.DestInfo.NumRows = Config.DimM; - ConvertInfo.DestInfo.NumColumns = Config.DimN; + ConvertInfo.DestInfo.NumRows = (UINT)Config.DimM; + ConvertInfo.DestInfo.NumColumns = (UINT)Config.DimN; if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32 || @@ -13495,10 +13501,10 @@ float4 ps_main() : SV_Target { AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32; - ConvertInfo.DestInfo.DestStride = Config.DimN * sizeof(float); + ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(float)); } else { ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8; - ConvertInfo.DestInfo.DestStride = Config.DimN * sizeof(int8_t); + ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(int8_t)); } // Get destination size using preview interface @@ -13549,7 +13555,7 @@ float4 ps_main() : SV_Target { float *ResultBuffer = (float *)MappedData.data(); bool Equal = true; - for (int i = 0; i < (UINT)InputMatrix.size() / sizeof(float); i++) { + for (size_t i = 0; i < (UINT)InputMatrix.size() / sizeof(float); i++) { if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) || fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) { LogErrorFmt(L"Result mismatch at index %d", i); From ed3744a6ce86a6c23a38b1d464b5cb2cfb737e9b Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 16:49:20 -0400 Subject: [PATCH 20/26] Address style/format --- tools/clang/unittests/HLSLExec/CoopVec.h | 152 ++++++++++-------- .../unittests/HLSLExec/ExecutionTest.cpp | 97 +++++------ 2 files changed, 133 insertions(+), 116 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index 689a4f214f..23a8dae170 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -369,7 +369,62 @@ bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32; } -struct TestVector { +static size_t +GetVectorElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataType, + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + switch (DataType) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + return sizeof(int8_t); + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + return sizeof(int16_t); + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { + return sizeof(int8_t); + } else { + return sizeof(int32_t); + } + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + return sizeof(DirectX::PackedVector::HALF); + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + return sizeof(float); + default: + throw std::invalid_argument("Unsupported data type"); + } +} + +static size_t +GetMatrixElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { + switch (DataInterpretation) { + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: + case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + // The CPU reference matrix is always int8 for all integer + // interpretations. The GPU version will be converted to the destination + // format by ConvertLinearAlgebraMatrix. + return sizeof(int8_t); + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: + case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + // The CPU reference matrix is always FP32 for all FP interpretations. + // The GPU version will be converted to the destination format by + // ConvertLinearAlgebraMatrix. + return sizeof(float); + default: + throw std::invalid_argument("Unsupported data type"); + } +} + +class TestVector { private: size_t NumVectors = 0; size_t VectorSize = 0; @@ -390,7 +445,7 @@ struct TestVector { if (ElementSize == 0) throw std::invalid_argument("ElementSize must be greater than 0"); - size_t VectorBytes = VectorSize * ElementSize; + const size_t VectorBytes = VectorSize * ElementSize; Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment; TotalBytes = Stride * NumVectors; @@ -550,22 +605,21 @@ struct TestVector { if constexpr (std::is_same_v || std::is_same_v) { float Elt = 0.0f; - if (IsIntegralDataType(MatrixInterpretation)) { + + if (IsIntegralDataType(MatrixInterpretation)) Elt = (float)(Rnd() & 0x7) - 3.0f; - } else { + else Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; - } - if constexpr (std::is_same_v) { + + if constexpr (std::is_same_v) Vec[J] = static_cast(ConvertFloat32ToFloat16(Elt)); - } else { + else Vec[J] = static_cast(Elt); - } } else { - if constexpr (std::is_signed_v) { + if constexpr (std::is_signed_v) Vec[J] = static_cast((int32_t)(Rnd() & 0xf) - 8); - } else { + else Vec[J] = static_cast((uint32_t)(Rnd() & 0xf)); - } } } } @@ -596,36 +650,9 @@ struct TestVector { D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation, D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation, std::mt19937 &Rnd) { - size_t ElementSize; - switch (DataType) { - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: - case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: - ElementSize = sizeof(int8_t); - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: - case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: - ElementSize = sizeof(int16_t); - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: - case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: - if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) { - ElementSize = sizeof(int8_t); - } else { - ElementSize = sizeof(int32_t); - } - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: - ElementSize = sizeof(DirectX::PackedVector::HALF); - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: - ElementSize = sizeof(float); - break; - default: - throw std::invalid_argument("Unsupported data type"); - } + const size_t ElementSize = + ::CoopVecHelpers::GetVectorElementSize(DataType, DataInterpretation); + TestVector Vec(NumVectors, VectorSize, ElementSize); switch (DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: @@ -670,25 +697,9 @@ struct TestVector { createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize, D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation, std::mt19937 &Rnd) { - size_t ElementSize; - switch (DataInterpretation) { - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: - case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8: - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16: - case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: - case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: - case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: - ElementSize = sizeof(int8_t); - break; - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: - case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: - ElementSize = sizeof(float); - break; - default: - throw std::invalid_argument("Unsupported data type"); - } + const size_t ElementSize = + ::CoopVecHelpers::GetMatrixElementSize(DataInterpretation); + TestVector Vec(NumVectors, VectorSize, ElementSize); switch (DataInterpretation) { case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8: @@ -808,21 +819,20 @@ struct TestVector { for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize(); ++InputIdx) { float InputElem; - if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { + if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) InputElem = InputVector.getVector(VecIdx)[InputIdx]; - } else { + else InputElem = ConvertFloat16ToFloat32( InputVector.getVector( VecIdx)[InputIdx]); - } + float const MatrixElem = Matrix.getVector(OutputIdx)[InputIdx]; Acc += InputElem * MatrixElem; } - if (HasBias) { + if (HasBias) Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); - } float Result = Acc; ResultVec.getVector(VecIdx)[OutputIdx] = Result; @@ -838,19 +848,19 @@ struct TestVector { for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize(); ++InputIdx) { int InputElem; - if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) { - InputElem = (int)InputVector.getVector(VecIdx)[InputIdx]; - } else { + if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) + InputElem = static_cast( + InputVector.getVector(VecIdx)[InputIdx]); + else InputElem = InputVector.getVector(VecIdx)[InputIdx]; - } + int const MatrixElem = Matrix.getVector(OutputIdx)[InputIdx]; Acc += InputElem * MatrixElem; } - if (HasBias) { + if (HasBias) Acc += InputBiasI32[OutputIdx]; - } float Result = float(Acc); ResultVec.getVector(VecIdx)[OutputIdx] = Result; diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index afdde90029..f3dc75395e 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12013,9 +12013,9 @@ void ExecutionTest::runCoopVecMulTest() { #else // Create device and verify coopvec support CComPtr D3DDevice; - if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) { + if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) return; - } + if (!DoesDeviceSupportCooperativeVector(D3DDevice)) { WEX::Logging::Log::Comment( "Device does not support cooperative vector. Skipping."); @@ -12351,8 +12351,9 @@ void ExecutionTest::runCoopVecMulSubtest( std::mt19937 Rnd(0x42); LogCommentFmt( - L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: " - L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s, Stage: %s", + L"Running test for InputPerThread: %zu, OutputPerThread: %zu, " + L"NumThreads: " + L"%zu, NumLayers: %zu, Bias: %s, MatrixLayout: %s, Stage: %s", Config.InputPerThread, Config.OutputPerThread, Config.NumThreads, Config.NumLayers, Config.Bias ? L"true" : L"false", CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(), @@ -12365,8 +12366,8 @@ void ExecutionTest::runCoopVecMulSubtest( CComPtr RootSignature; { CD3DX12_DESCRIPTOR_RANGE Ranges[2]; - Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + (UINT)Config.NumLayers, - 0, + Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, + 2 + static_cast(Config.NumLayers), 0, 0); // InputVector, InputBias, InputMatrices[] Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer @@ -12387,7 +12388,7 @@ void ExecutionTest::runCoopVecMulSubtest( { D3D12_DESCRIPTOR_HEAP_DESC Desc = {}; Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - Desc.NumDescriptors = 3 + (UINT)Config.NumLayers; + Desc.NumDescriptors = 3 + static_cast(Config.NumLayers); Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE; VERIFY_SUCCEEDED( D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap))); @@ -12656,7 +12657,6 @@ float4 ps_main() : SV_Target { Options.data(), (int)Options.size(), IncludeHandler); D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {}; - // psoDesc.InputLayout; PsoDesc.pRootSignature = RootSignature; PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader); PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader); @@ -12711,7 +12711,7 @@ float4 ps_main() : SV_Target { // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector.getTotalBytes() / sizeof(int32_t)), + static_cast(InputVector.getTotalBytes() / sizeof(int32_t)), InputVecSRVResource); // Create input bias @@ -12724,7 +12724,7 @@ float4 ps_main() : SV_Target { // This increments baseHandle CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)), + static_cast(InputBias.getTotalBytes() / sizeof(int32_t)), InputBiasSRVResource); // Create converted matrix resource and SRV for each input matrix @@ -12779,7 +12779,8 @@ float4 ps_main() : SV_Target { CreateTestUavs(D3DDevice, CommandList, OutputBufferInit.data(), OutputBufferSize, &UavResource, &UavUploadResource, &UavReadResource); - CreateRawUAV(D3DDevice, BaseHandle, (UINT)OutputBufferSize / 4, UavResource); + CreateRawUAV(D3DDevice, BaseHandle, static_cast(OutputBufferSize / 4), + UavResource); CommandList->Close(); ExecuteCommandList(CommandQueue, CommandList); @@ -12813,12 +12814,12 @@ float4 ps_main() : SV_Target { D3D12_RECT ScissorRect; memset(&Viewport, 0, sizeof(Viewport)); - Viewport.Height = (float)RtDesc.Height; - Viewport.Width = (float)RtDesc.Width; + Viewport.Height = static_cast(RtDesc.Height); + Viewport.Width = static_cast(RtDesc.Width); Viewport.MaxDepth = 1.0f; memset(&ScissorRect, 0, sizeof(ScissorRect)); - ScissorRect.right = (long)RtDesc.Width; - ScissorRect.bottom = RtDesc.Height; + ScissorRect.right = static_cast(RtDesc.Width); + ScissorRect.bottom = static_cast(RtDesc.Height); CommandList->SetGraphicsRootSignature(RootSignature); CommandList->SetGraphicsRootDescriptorTable(0, ResHandle); CommandList->SetGraphicsRootUnorderedAccessView( @@ -12849,9 +12850,9 @@ float4 ps_main() : SV_Target { WaitForSignal(CommandQueue, FO); { - MappedData MappedData(UavReadResource, (UINT)OutputBufferSize); + MappedData MappedData(UavReadResource, static_cast(OutputBufferSize)); - float *ResultBuffer = (float *)MappedData.data(); + float *ResultBuffer = reinterpret_cast(MappedData.data()); bool Equal = true; float MaxError = 0.00001f; @@ -12874,7 +12875,7 @@ float4 ps_main() : SV_Target { float Expected = ExpectedOutput.getVector(i)[j]; if (isnan(Result) || isnan(Expected) || fabs(Result - Expected) > MaxError) { - LogErrorFmt(L"Result mismatch at vector %d, element %d", i, j); + LogErrorFmt(L"Result mismatch at vector %zu, element %zu", i, j); LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected); Equal = false; break; @@ -12901,9 +12902,9 @@ void ExecutionTest::runCoopVecOuterProductTest() { #else // Create device and verify coopvec support CComPtr D3DDevice; - if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) { + if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) return; - } + if (!DoesDeviceSupportCooperativeVector(D3DDevice)) { WEX::Logging::Log::Comment( "Device does not support cooperative vector. Skipping."); @@ -12981,7 +12982,8 @@ void ExecutionTest::runCoopVecOuterProductSubtest( std::mt19937 Rnd(0x42); LogCommentFmt( - L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s, " + L"Running test for DimM: %zu, DimN: %zu, NumThreads: %zu, MatrixLayout: " + L"%s, " L"Stage: %s", Config.DimM, Config.DimN, Config.NumThreads, CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(), @@ -13241,7 +13243,6 @@ float4 ps_main() : SV_Target { Options, _countof(Options), IncludeHandler); D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {}; - // psoDesc.InputLayout; PsoDesc.pRootSignature = RootSignature; PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader); PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader); @@ -13293,12 +13294,14 @@ float4 ps_main() : SV_Target { &InputVecSRVResource2, &InputVecSRVUploadResource2); // This increments baseHandle - CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector1.getTotalBytes() / sizeof(int32_t)), - InputVecSRVResource1); - CreateRawSRV(D3DDevice, BaseHandle, - (UINT)(InputVector2.getTotalBytes() / sizeof(int32_t)), - InputVecSRVResource2); + CreateRawSRV( + D3DDevice, BaseHandle, + static_cast(InputVector1.getTotalBytes() / sizeof(int32_t)), + InputVecSRVResource1); + CreateRawSRV( + D3DDevice, BaseHandle, + static_cast(InputVector2.getTotalBytes() / sizeof(int32_t)), + InputVecSRVResource2); CComPtr ConvertedMatrixResource, ConvertedMatrixReadResource; UINT ConvertedMatrixSize = 0; @@ -13337,19 +13340,19 @@ float4 ps_main() : SV_Target { DestEltSize = 1; // FP8 break; } - SrcInfo.SrcStride = (UINT)(Config.DimM * SrcEltSize); - SrcInfo.SrcSize = (UINT)(Config.DimM * Config.DimN * SrcEltSize); + SrcInfo.SrcStride = static_cast(Config.DimM * SrcEltSize); + SrcInfo.SrcSize = static_cast(Config.DimM * Config.DimN * SrcEltSize); DestInfo.DestLayout = Config.MatrixLayout; DestInfo.DestStride = 0; - DestInfo.NumRows = (UINT)Config.DimM; - DestInfo.NumColumns = (UINT)Config.DimN; + DestInfo.NumRows = static_cast(Config.DimM); + DestInfo.NumColumns = static_cast(Config.DimN); if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { - DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize); + DestInfo.DestStride = static_cast(Config.DimM * DestEltSize); } else if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { - DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize); + DestInfo.DestStride = static_cast(Config.DimM * DestEltSize); } // Create conversion info @@ -13439,12 +13442,12 @@ float4 ps_main() : SV_Target { D3D12_RECT ScissorRect; memset(&Viewport, 0, sizeof(Viewport)); - Viewport.Height = (float)RtDesc.Height; - Viewport.Width = (float)RtDesc.Width; + Viewport.Height = static_cast(RtDesc.Height); + Viewport.Width = static_cast(RtDesc.Width); Viewport.MaxDepth = 1.0f; memset(&ScissorRect, 0, sizeof(ScissorRect)); - ScissorRect.right = (long)RtDesc.Width; - ScissorRect.bottom = RtDesc.Height; + ScissorRect.right = static_cast(RtDesc.Width); + ScissorRect.bottom = static_cast(RtDesc.Height); CommandList->SetGraphicsRootSignature(RootSignature); CommandList->SetGraphicsRootDescriptorTable(0, ResHandle); CommandList->SetGraphicsRootUnorderedAccessView( @@ -13489,8 +13492,8 @@ float4 ps_main() : SV_Target { ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver ConvertInfo.DestInfo.DestLayout = D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR; - ConvertInfo.DestInfo.NumRows = (UINT)Config.DimM; - ConvertInfo.DestInfo.NumColumns = (UINT)Config.DimN; + ConvertInfo.DestInfo.NumRows = static_cast(Config.DimM); + ConvertInfo.DestInfo.NumColumns = static_cast(Config.DimN); if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32 || @@ -13501,10 +13504,12 @@ float4 ps_main() : SV_Target { AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) { ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32; - ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(float)); + ConvertInfo.DestInfo.DestStride = + static_cast(Config.DimN * sizeof(float)); } else { ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8; - ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(int8_t)); + ConvertInfo.DestInfo.DestStride = + static_cast(Config.DimN * sizeof(int8_t)); } // Get destination size using preview interface @@ -13551,15 +13556,17 @@ float4 ps_main() : SV_Target { WaitForSignal(CommandQueue, FO); { - MappedData MappedData(MatrixRowMajorReadResource, (UINT)InputMatrix.size()); + MappedData MappedData(MatrixRowMajorReadResource, + static_cast(InputMatrix.size())); float *ResultBuffer = (float *)MappedData.data(); bool Equal = true; - for (size_t i = 0; i < (UINT)InputMatrix.size() / sizeof(float); i++) { + for (size_t i = 0; + i < static_cast(InputMatrix.size() / sizeof(float)); i++) { if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) || fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) { LogErrorFmt(L"Result mismatch at index %d", i); - LogErrorFmt(L"ResultBuffer[%d]: %f, ExpectedOutputBuffer[%d]: %f", i, + LogErrorFmt(L"ResultBuffer[%zu]: %f, ExpectedOutputBuffer[%zu]: %f", i, ResultBuffer[i], i, ExpectedOutputBuffer[i]); Equal = false; break; From 3efd69a9d7a478e0a34677f95b5e4a084a453f89 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 16:52:03 -0400 Subject: [PATCH 21/26] Fix missing static_cast in CoopVec.h --- tools/clang/unittests/HLSLExec/CoopVec.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index 23a8dae170..1db1c0c19b 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -757,21 +757,21 @@ class TestVector { DestEltSize = 1; // FP8 break; } - ConvertInfo.SrcInfo.SrcStride = (UINT)getStride(); - ConvertInfo.SrcInfo.SrcSize = (UINT)getTotalBytes(); + ConvertInfo.SrcInfo.SrcStride = static_cast(getStride()); + ConvertInfo.SrcInfo.SrcSize = static_cast(getTotalBytes()); ConvertInfo.DestInfo.DestLayout = MatrixLayout; ConvertInfo.DestInfo.DestStride = 0; - ConvertInfo.DestInfo.NumRows = (UINT)getNumVectors(); - ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize(); + ConvertInfo.DestInfo.NumRows = static_cast(getNumVectors()); + ConvertInfo.DestInfo.NumColumns = static_cast(getVectorSize()); if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { ConvertInfo.DestInfo.DestStride = - ((UINT)getVectorSize() * DestEltSize + 15) & ~15; + (static_cast(getVectorSize()) * DestEltSize + 15) & ~15; } else if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { ConvertInfo.DestInfo.DestStride = - ((UINT)getNumVectors() * DestEltSize + 15) & ~15; + (static_cast(getNumVectors()) * DestEltSize + 15) & ~15; } // Get destination size using preview interface From f77d76f9aefd4b313b46ac0d591d2203a304194c Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Mon, 12 May 2025 16:57:25 -0400 Subject: [PATCH 22/26] Use proposed refactor for packed integer type exclusions in runCoopVecMulTestConfig --- .../unittests/HLSLExec/ExecutionTest.cpp | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index f3dc75395e..ad1912f46b 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12317,19 +12317,23 @@ void ExecutionTest::runCoopVecMulTestConfig( continue; } - if (Config.NumLayers > 1 && - (MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || - MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 || - MulProps.InputInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || - MulProps.InputInterpretation == - D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) && - (MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 || - MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32)) { - // We do not support multi-layer tests with packed types as input with - // full-precision integer bias Supporting this in the current framework - // would require repacking the accumulator vectors - continue; + if (Config.NumLayers > 1) { + const bool IsPackedType = + MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || + MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 || + MulProps.InputInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED || + MulProps.InputInterpretation == + D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED; + + const bool IsFullPrecisionIntegerBias = + MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 || + MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32; + + if (IsPackedType && IsFullPrecisionIntegerBias) + // In the current framework this would require repacking the accumulator + // vectors in HLSL. + continue; } bool IsInFilter = CoopVecHelpers::IsMatrixLayoutInFilter( From 1e4662c46ac0b2224a4004b4c809f2a1e94cbe53 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 13 May 2025 09:02:06 -0400 Subject: [PATCH 23/26] Style fixes and rewrite of TestVector to use unique_ptr --- tools/clang/unittests/HLSLExec/CoopVec.h | 199 +++++------------- .../unittests/HLSLExec/ExecutionTest.cpp | 36 +++- 2 files changed, 83 insertions(+), 152 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index 1db1c0c19b..e810117109 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -13,7 +14,7 @@ #include "CoopVecAPI.h" -struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { +class LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { private: DXC_MICROCOM_REF_FIELD(RefCount) dxc::DxcDllSupport &DxcSupport; @@ -32,6 +33,8 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler { WEX::Common::String ParamValue; if (FAILED(WEX::TestExecution::RuntimeParameters::TryGetValue( L"LinAlgHeader", ParamValue))) { + WEX::Logging::Log::Error( + L"Missing expected TAEF runtime parameter LinAlgHeader"); return E_FAIL; } @@ -79,7 +82,7 @@ static std::vector CreateAllOnesInputMatrix(size_t Width, } else if constexpr (std::is_same_v) { InputMatrix[i] = 1.0f; } else { - WEX::Logging::Log::Error(L"Unsupported input type"); + VERIFY_FAIL(L"Unsupported input type"); break; } } @@ -91,60 +94,6 @@ static std::vector CreateAllOnesInputMatrix(size_t Width, return Uint8InputMatrix; } -template -static std::vector CreateInputVector(size_t NumThreads, - size_t EltsPerThread) { - std::vector InputVector(NumThreads * EltsPerThread); - std::fill(InputVector.begin(), InputVector.end(), EltTy(0)); - if (EltsPerThread < 2) { - WEX::Logging::Log::Error(L"EltsPerThread must be at least 2"); - return std::vector(); - } - for (size_t TID = 0; TID < NumThreads; TID++) { - if constexpr (std::is_same_v || - std::is_same_v) { - InputVector[TID * EltsPerThread + 0] = 1; - InputVector[TID * EltsPerThread + 1] = 1; - } else if constexpr (std::is_same_v) { - InputVector[TID * EltsPerThread + 0] = ConvertFloat32ToFloat16(1.0f); - InputVector[TID * EltsPerThread + 1] = ConvertFloat32ToFloat16(1.0f); - } else if constexpr (std::is_same_v) { - InputVector[TID * EltsPerThread + 0] = 1.0f; - InputVector[TID * EltsPerThread + 1] = 1.0f; - } else { - WEX::Logging::Log::Error(L"Unsupported input type"); - break; - } - } - - // Convert to uint8_t vector - std::vector Uint8InputVector(InputVector.size() * sizeof(EltTy)); - std::memcpy(Uint8InputVector.data(), InputVector.data(), - InputVector.size() * sizeof(EltTy)); - return Uint8InputVector; -} - -template -static std::vector CreateInputBias(size_t NumElts) { - std::vector InputBias(NumElts); - if constexpr (std::is_same_v || - std::is_same_v) { - std::fill(InputBias.begin(), InputBias.end(), EltTy(1)); - } else if constexpr (std::is_same_v) { - std::fill(InputBias.begin(), InputBias.end(), - ConvertFloat32ToFloat16(1.0f)); - } else if constexpr (std::is_same_v) { - std::fill(InputBias.begin(), InputBias.end(), 1); - } else { - WEX::Logging::Log::Error(L"Unsupported bias type"); - } - // Convert to uint8_t vector - std::vector Uint8InputBias(InputBias.size() * sizeof(EltTy)); - std::memcpy(Uint8InputBias.data(), InputBias.data(), - InputBias.size() * sizeof(EltTy)); - return Uint8InputBias; -} - static std::wstring DataTypeToFilterString(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { switch (DataType) { @@ -173,7 +122,9 @@ DataTypeToFilterString(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: return L"FLOAT_E5M2"; default: - return L""; + VERIFY_FAIL(WEX::Common::String().Format( + L"Unrecognized D3D12_LINEAR_ALGEBRA_DATATYPE: %d", DataType)); + return L""; } } @@ -266,7 +217,7 @@ GetStrideMultiplierForMatrixDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: return 4; default: - WEX::Logging::Log::Error(L"Unsupported matrix data type"); + VERIFY_FAIL(L"Unsupported matrix data type"); return 1; } } @@ -302,8 +253,8 @@ GetHlslDataTypeForDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: return L"float"; default: - WEX::Logging::Log::Error(L"Unsupported input data type"); - return L""; + VERIFY_FAIL(L"Unsupported input data type"); + return L""; } } @@ -335,8 +286,8 @@ GetHlslInterpretationForDataType(D3D12_LINEAR_ALGEBRA_DATATYPE Interpretation) { case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: return L"DATA_TYPE_FLOAT8_E5M2"; default: - WEX::Logging::Log::Error(L"Unsupported interpretation"); - return L""; + VERIFY_FAIL(L"Unsupported interpretation"); + return L""; } } @@ -360,7 +311,7 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) { } } -bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { +static bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) { return DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 || DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 || DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16 || @@ -394,7 +345,8 @@ GetVectorElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataType, case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: return sizeof(float); default: - throw std::invalid_argument("Unsupported data type"); + VERIFY_FAIL(L"Unsupported data type"); + return 0; } } @@ -420,7 +372,8 @@ GetMatrixElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) { // ConvertLinearAlgebraMatrix. return sizeof(float); default: - throw std::invalid_argument("Unsupported data type"); + VERIFY_FAIL(L"Unsupported data type"); + return 0; } } @@ -431,7 +384,7 @@ class TestVector { size_t ElementSize = 0; size_t Stride = 0; size_t TotalBytes = 0; - uint8_t *Buffer = nullptr; + std::unique_ptr Buffer; public: TestVector(size_t NumVectors, size_t VectorSize, size_t ElementSize, @@ -439,24 +392,18 @@ class TestVector { : NumVectors(NumVectors), VectorSize(VectorSize), ElementSize(ElementSize) { if (NumVectors == 0) - throw std::invalid_argument("NumVectors must be greater than 0"); + VERIFY_FAIL(L"NumVectors must be greater than 0"); if (VectorSize == 0) - throw std::invalid_argument("VectorSize must be greater than 0"); + VERIFY_FAIL(L"VectorSize must be greater than 0"); if (ElementSize == 0) - throw std::invalid_argument("ElementSize must be greater than 0"); + VERIFY_FAIL(L"ElementSize must be greater than 0"); const size_t VectorBytes = VectorSize * ElementSize; Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment; TotalBytes = Stride * NumVectors; - void *Ptr = nullptr; -#ifdef _MSC_VER - Ptr = _aligned_malloc(TotalBytes, Alignment); -#else - Ptr = std::aligned_alloc(Alignment, TotalBytes); -#endif - Buffer = reinterpret_cast(Ptr); - std::fill(Buffer, Buffer + TotalBytes, (uint8_t)0xFF); + Buffer = std::make_unique(TotalBytes); + std::fill(Buffer.get(), Buffer.get() + TotalBytes, (uint8_t)0xFF); } // Copy constructor @@ -464,17 +411,9 @@ class TestVector { : NumVectors(other.NumVectors), VectorSize(other.VectorSize), ElementSize(other.ElementSize), Stride(other.Stride), TotalBytes(other.TotalBytes) { - - void *Ptr = nullptr; -#ifdef _MSC_VER - Ptr = _aligned_malloc(TotalBytes, 16); -#else - Ptr = std::aligned_alloc(16, TotalBytes); -#endif - Buffer = reinterpret_cast(Ptr); - if (other.Buffer) { - std::memcpy(Buffer, other.Buffer, TotalBytes); + Buffer = std::make_unique(TotalBytes); + std::memcpy(Buffer.get(), other.Buffer.get(), TotalBytes); } } @@ -482,48 +421,28 @@ class TestVector { TestVector(TestVector &&other) noexcept : NumVectors(other.NumVectors), VectorSize(other.VectorSize), ElementSize(other.ElementSize), Stride(other.Stride), - TotalBytes(other.TotalBytes), Buffer(other.Buffer) { - + TotalBytes(other.TotalBytes), Buffer(std::move(other.Buffer)) { // Reset the source object other.NumVectors = 0; other.VectorSize = 0; other.ElementSize = 0; other.Stride = 0; other.TotalBytes = 0; - other.Buffer = nullptr; } - ~TestVector() { - if (Buffer) { -#ifdef _MSC_VER - _aligned_free(Buffer); -#else - std::free(Buffer); -#endif - } - } + ~TestVector() = default; size_t getNumVectors() const { return NumVectors; } size_t getVectorSize() const { return VectorSize; } size_t getElementSize() const { return ElementSize; } size_t getStride() const { return Stride; } size_t getTotalBytes() const { return TotalBytes; } - uint8_t *getBuffer() { return Buffer; } - const uint8_t *getBuffer() const { return Buffer; } + uint8_t *getBuffer() { return Buffer.get(); } + const uint8_t *getBuffer() const { return Buffer.get(); } // Copy assignment operator TestVector &operator=(const TestVector &other) { if (this != &other) { - // Free existing buffer - if (Buffer) { -#ifdef _MSC_VER - _aligned_free(Buffer); -#else - std::free(Buffer); -#endif - Buffer = nullptr; - } - // Copy metadata NumVectors = other.NumVectors; VectorSize = other.VectorSize; @@ -531,18 +450,13 @@ class TestVector { Stride = other.Stride; TotalBytes = other.TotalBytes; - // Allocate new buffer - void *Ptr = nullptr; -#ifdef _MSC_VER - Ptr = _aligned_malloc(TotalBytes, 16); -#else - Ptr = std::aligned_alloc(16, TotalBytes); -#endif - Buffer = reinterpret_cast(Ptr); - // Copy data - if (other.Buffer) - std::memcpy(Buffer, other.Buffer, TotalBytes); + if (other.Buffer) { + Buffer = std::make_unique(TotalBytes); + std::memcpy(Buffer.get(), other.Buffer.get(), TotalBytes); + } else { + Buffer.reset(); + } } return *this; } @@ -550,22 +464,13 @@ class TestVector { // Move assignment operator TestVector &operator=(TestVector &&other) noexcept { if (this != &other) { - // Free existing buffer - if (Buffer) { -#ifdef _MSC_VER - _aligned_free(Buffer); -#else - std::free(Buffer); -#endif - } - // Move metadata and buffer NumVectors = other.NumVectors; VectorSize = other.VectorSize; ElementSize = other.ElementSize; Stride = other.Stride; TotalBytes = other.TotalBytes; - Buffer = other.Buffer; + Buffer = std::move(other.Buffer); // Reset the source object other.NumVectors = 0; @@ -573,19 +478,16 @@ class TestVector { other.ElementSize = 0; other.Stride = 0; other.TotalBytes = 0; - other.Buffer = nullptr; } return *this; } template T *getVector(size_t I) { - uint8_t *Ptr = Buffer + I * Stride; - return reinterpret_cast(Ptr); + return reinterpret_cast(Buffer.get() + I * Stride); } template const T *getVector(size_t I) const { - const uint8_t *Ptr = Buffer + I * Stride; - return reinterpret_cast(Ptr); + return reinterpret_cast(Buffer.get() + I * Stride); } template void fill(const T &Value) { @@ -607,9 +509,9 @@ class TestVector { float Elt = 0.0f; if (IsIntegralDataType(MatrixInterpretation)) - Elt = (float)(Rnd() & 0x7) - 3.0f; + Elt = static_cast(Rnd() & 0x7) - 3.0f; else - Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; + Elt = (static_cast(Rnd() & 0x3) - 1.0f) / 2.0f; if constexpr (std::is_same_v) Vec[J] = static_cast(ConvertFloat32ToFloat16(Elt)); @@ -629,10 +531,10 @@ class TestVector { T *Vec = getVector(I); for (size_t J = 0; J < VectorSize; ++J) if constexpr (std::is_same_v) { - float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; + float Elt = (static_cast(Rnd() & 0x3) - 1.0f) / 2.0f; Vec[J] = static_cast(ConvertFloat32ToFloat16(Elt)); } else if constexpr (std::is_same_v) { - float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f; + float Elt = (static_cast(Rnd() & 0x3) - 1.0f) / 2.0f; Vec[J] = static_cast(Elt); } else { if constexpr (std::is_signed_v) { @@ -688,7 +590,8 @@ class TestVector { Vec.fillSimpleTestData(MatrixInterpretation, Rnd); break; default: - throw std::invalid_argument("Unsupported data type"); + VERIFY_FAIL(L"Unsupported data type"); + break; } return Vec; } @@ -717,7 +620,8 @@ class TestVector { Vec.FillSimpleMatrixTestData(Rnd); break; default: - throw std::invalid_argument("Unsupported data type"); + VERIFY_FAIL(L"Unsupported data type"); + break; } return Vec; } @@ -834,8 +738,7 @@ class TestVector { if (HasBias) Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]); - float Result = Acc; - ResultVec.getVector(VecIdx)[OutputIdx] = Result; + ResultVec.getVector(VecIdx)[OutputIdx] = Acc; } } } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) { @@ -862,12 +765,12 @@ class TestVector { if (HasBias) Acc += InputBiasI32[OutputIdx]; - float Result = float(Acc); - ResultVec.getVector(VecIdx)[OutputIdx] = Result; + ResultVec.getVector(VecIdx)[OutputIdx] = + static_cast(Acc); } } } else { - throw std::invalid_argument("Unsupported matrix interpretation"); + VERIFY_FAIL(L"Unsupported matrix interpretation"); } return ResultVec; diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index ad1912f46b..f29ecb8fb4 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12013,14 +12013,28 @@ void ExecutionTest::runCoopVecMulTest() { #else // Create device and verify coopvec support CComPtr D3DDevice; - if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) + if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) { +#ifdef _HLK_CONF + LOG_ERROR_FMT_THROW( + L"Device does not support SM 6.9. Can't run these tests."); +#else + WEX::Logging::Log::Comment( + "Device does not support SM 6.9. Can't run these tests."); + WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); return; +#endif + } if (!DoesDeviceSupportCooperativeVector(D3DDevice)) { +#ifdef _HLK_CONF + LOG_ERROR_FMT_THROW( + L"Device does not support cooperative vectors. Can't run these tests."); +#else WEX::Logging::Log::Comment( - "Device does not support cooperative vector. Skipping."); + "Device does not support cooperative vectors. Can't run these tests."); WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); return; +#endif } // Query coopvec feature data. First call gets the size of the arrays. The @@ -12906,14 +12920,28 @@ void ExecutionTest::runCoopVecOuterProductTest() { #else // Create device and verify coopvec support CComPtr D3DDevice; - if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) + if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) { +#ifdef _HLK_CONF + LOG_ERROR_FMT_THROW( + L"Device does not support SM 6.9. Can't run these tests."); +#else + WEX::Logging::Log::Comment( + "Device does not support SM 6.9. Can't run these tests."); + WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); return; +#endif + } if (!DoesDeviceSupportCooperativeVector(D3DDevice)) { +#ifdef _HLK_CONF + LOG_ERROR_FMT_THROW( + L"Device does not support cooperative vectors. Can't run these tests."); +#else WEX::Logging::Log::Comment( - "Device does not support cooperative vector. Skipping."); + "Device does not support cooperative vectors. Can't run these tests."); WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped); return; +#endif } // Query coopvec feature data. First call gets the size of the arrays. The From 44b1ce9c8e9a9c6308a1757e7990cd5967d56697 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 13 May 2025 09:10:20 -0400 Subject: [PATCH 24/26] Move PipelineState closer to first use --- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index f29ecb8fb4..ccd6f2b8c5 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12446,9 +12446,6 @@ void ExecutionTest::runCoopVecMulSubtest( I == 0 ? MulProps.InputType : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32); } - // Create the compute pipeline state for the CoopVec shader - CComPtr PipelineState; - std::string ShaderSource = R"( #include "dx/linalg.h" @@ -12661,6 +12658,9 @@ float4 ps_main() : SV_Target { CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); + // Create the pipeline state for the CoopVec shaders + CComPtr PipelineState; + if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", &PipelineState, Options.data(), (int)Options.size(), @@ -13123,9 +13123,6 @@ void ExecutionTest::runCoopVecOuterProductSubtest( return; } - // Create a compute pipeline state object. - CComPtr PipelineState; - std::string ShaderSource = R"( #include "dx/linalg.h" @@ -13261,6 +13258,9 @@ float4 ps_main() : SV_Target { CComPtr IncludeHandler = new LinAlgHeaderIncludeHandler(m_support); + // Create the pipeline state for the CoopVec shaders + CComPtr PipelineState; + if (RunCompute) { CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9", &PipelineState, Options, _countof(Options), From 1275bd1f2f3a6776d39c03fd2e85930638d9cba2 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 13 May 2025 09:15:44 -0400 Subject: [PATCH 25/26] Rename creatAllOnesTestMatrix to createSimpleTestMatrix to reflect its current implementation --- tools/clang/unittests/HLSLExec/CoopVec.h | 12 +++++++++--- tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index e810117109..294e63df5e 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -597,9 +597,9 @@ class TestVector { } static TestVector - createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize, - D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation, - std::mt19937 &Rnd) { + createSimpleTestMatrix(size_t NumVectors, size_t VectorSize, + D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation, + std::mt19937 &Rnd) { const size_t ElementSize = ::CoopVecHelpers::GetMatrixElementSize(DataInterpretation); @@ -611,12 +611,18 @@ class TestVector { case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16: case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32: case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32: + // The CPU reference matrix is always int8 for all integer + // interpretations. The GPU version will be converted to the destination + // format by ConvertLinearAlgebraMatrix. Vec.FillSimpleMatrixTestData(Rnd); break; case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16: case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32: + // The CPU reference matrix is always FP32 for all FP interpretations. + // The GPU version will be converted to the destination format by + // ConvertLinearAlgebraMatrix. Vec.FillSimpleMatrixTestData(Rnd); break; default: diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp index ccd6f2b8c5..b54ebe6f95 100644 --- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp +++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp @@ -12420,12 +12420,12 @@ void ExecutionTest::runCoopVecMulSubtest( for (size_t I = 0; I < Config.NumLayers - 1; ++I) { // Each layer except the last is InputPerThread x InputPerThread InputMatrices.push_back( - ::CoopVecHelpers::TestVector::createAllOnesTestMatrix( + ::CoopVecHelpers::TestVector::createSimpleTestMatrix( Config.InputPerThread, Config.InputPerThread, MulProps.MatrixInterpretation, Rnd)); } // Last layer, matrix size is OutputPerThread x InputPerThread - InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix( + InputMatrices.push_back(::CoopVecHelpers::TestVector::createSimpleTestMatrix( Config.OutputPerThread, Config.InputPerThread, MulProps.MatrixInterpretation, Rnd)); From 237e0fb8457f9ae2d71e4ad4b1fa80a5df965c53 Mon Sep 17 00:00:00 2001 From: Justin Holewinski Date: Tue, 13 May 2025 14:33:37 -0400 Subject: [PATCH 26/26] Add comments for some magic numbers --- tools/clang/unittests/HLSLExec/CoopVec.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h index 294e63df5e..c5c81800ac 100644 --- a/tools/clang/unittests/HLSLExec/CoopVec.h +++ b/tools/clang/unittests/HLSLExec/CoopVec.h @@ -508,6 +508,9 @@ class TestVector { std::is_same_v) { float Elt = 0.0f; + // Generate random input in the following ranges: + // - Integral types: [-3, 4] by 1 + // - FP types: [-0.5, 1] by 0.5 if (IsIntegralDataType(MatrixInterpretation)) Elt = static_cast(Rnd() & 0x7) - 3.0f; else @@ -518,6 +521,9 @@ class TestVector { else Vec[J] = static_cast(Elt); } else { + // Generate random input in the following ranges: + // - Signed types: [-8, 7] by 1 + // - Unsigned types: [0, 15] by 1 if constexpr (std::is_signed_v) Vec[J] = static_cast((int32_t)(Rnd() & 0xf) - 8); else @@ -676,10 +682,12 @@ class TestVector { ConvertInfo.DestInfo.NumColumns = static_cast(getVectorSize()); if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) { + // Align to 16 bytes ConvertInfo.DestInfo.DestStride = (static_cast(getVectorSize()) * DestEltSize + 15) & ~15; } else if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) { + // Align to 16 bytes ConvertInfo.DestInfo.DestStride = (static_cast(getNumVectors()) * DestEltSize + 15) & ~15; }