From d59ea37784bb386615abef7fce80de2272b25a27 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 6 May 2025 14:40:37 -0400
Subject: [PATCH 01/26] Clean up vector handling code by introducing TestVector

---
 tools/clang/unittests/HLSLExec/CoopVec.h      | 200 +++++++
 .../unittests/HLSLExec/ExecutionTest.cpp      | 532 +++++++-----------
 2 files changed, 416 insertions(+), 316 deletions(-)
diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index f166c61f67..cd24a556bd 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -4,6 +4,8 @@
 
 #include <DirectXMath.h>
 #include <DirectXPackedVector.h>
+
+#include <cstdlib>
 #include <vector>
 
 #include "dxc/Support/microcom.h"
@@ -61,6 +63,7 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
 };
 
 namespace CoopVecHelpers {
+
 template <typename EltTy>
 static std::vector<uint8_t> CreateAllOnesInputMatrix(uint32_t Width,
                                                      uint32_t Height) {
@@ -354,6 +357,203 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) {
     return D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
   }
 }
+
+/// Owns an aligned byte buffer holding NumVectors vectors of VectorSize
+/// elements each. Vector I starts at byte offset I * Stride, where Stride is
+/// the vector's byte size rounded up to a multiple of Alignment. Bytes that
+/// are never written keep the 0xFF fill applied at construction.
+struct TestVector {
+private:
+  size_t NumVectors = 0;
+  size_t VectorSize = 0;
+  size_t ElementSize = 0;
+  size_t Alignment = 16;
+  size_t Stride = 0;
+  size_t TotalBytes = 0;
+  uint8_t *Buffer = nullptr;
+
+  /// Returns Bytes bytes aligned to Align, to be released by the destructor.
+  /// Throws std::runtime_error when the allocator returns null.
+  static uint8_t *allocate(size_t Bytes, size_t Align) {
+#ifdef _MSC_VER
+    void *Ptr = _aligned_malloc(Bytes, Align);
+#else
+    void *Ptr = std::aligned_alloc(Align, Bytes);
+#endif
+    if (!Ptr)
+      throw std::runtime_error("TestVector buffer allocation failed");
+    return static_cast<uint8_t *>(Ptr);
+  }
+
+  /// Builds a vector with ElementSize == sizeof(T) and fills it with the
+  /// (1, 1, 0, ...) pattern, so slot size and fill type cannot disagree.
+  /// AsHalf selects the float-to-half conversion for the stored values.
+  template <typename T, bool AsHalf = false>
+  static TestVector makeSimple(size_t NumVectors, size_t VectorSize) {
+    TestVector Vec(NumVectors, VectorSize, sizeof(T));
+    Vec.fillSimpleTestData<T, AsHalf>();
+    return Vec;
+  }
+
+public:
+  /// Allocates NumVectors slots of VectorSize * ElementSize bytes, each padded
+  /// to a multiple of Alignment, and sets every byte to 0xFF.
+  /// Throws std::invalid_argument if a count or size is zero or Alignment is
+  /// not a power of two, and std::runtime_error if the allocation fails.
+  TestVector(size_t NumVectors, size_t VectorSize, size_t ElementSize,
+             size_t Alignment = 16)
+      : NumVectors(NumVectors), VectorSize(VectorSize),
+        ElementSize(ElementSize), Alignment(Alignment) {
+    if (NumVectors == 0)
+      throw std::invalid_argument("NumVectors must be greater than 0");
+    if (VectorSize == 0)
+      throw std::invalid_argument("VectorSize must be greater than 0");
+    if (ElementSize == 0)
+      throw std::invalid_argument("ElementSize must be greater than 0");
+    // A zero alignment would divide by zero below, and the aligned allocators
+    // expect a power of two.
+    if (Alignment == 0 || (Alignment & (Alignment - 1)) != 0)
+      throw std::invalid_argument("Alignment must be a power of two");
+
+    size_t VectorBytes = VectorSize * ElementSize;
+    Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment;
+    TotalBytes = Stride * NumVectors;
+
+    Buffer = allocate(TotalBytes, Alignment);
+    std::fill(Buffer, Buffer + TotalBytes, (uint8_t)0xFF);
+  }
+
+  /// Deep-copies the layout and bytes of other. Copying an object that was
+  /// moved from (null buffer) produces another empty object.
+  TestVector(const TestVector &other)
+      : NumVectors(other.NumVectors), VectorSize(other.VectorSize),
+        ElementSize(other.ElementSize), Alignment(other.Alignment),
+        Stride(other.Stride), TotalBytes(other.TotalBytes) {
+    if (other.Buffer) {
+      // Allocate with the source's alignment rather than a fixed 16.
+      Buffer = allocate(TotalBytes, Alignment);
+      std::memcpy(Buffer, other.Buffer, TotalBytes);
+    }
+  }
+
+  /// Takes over other's buffer; other is left with a null buffer and zeroed
+  /// sizes.
+  TestVector(TestVector &&other) noexcept
+      : NumVectors(other.NumVectors), VectorSize(other.VectorSize),
+        ElementSize(other.ElementSize), Alignment(other.Alignment),
+        Stride(other.Stride), TotalBytes(other.TotalBytes),
+        Buffer(other.Buffer) {
+    other.NumVectors = 0;
+    other.VectorSize = 0;
+    other.ElementSize = 0;
+    other.Stride = 0;
+    other.TotalBytes = 0;
+    other.Buffer = nullptr;
+  }
+
+  // Assignment is unavailable: declaring the move constructor already
+  // suppresses the implicit operators, so state that explicitly.
+  TestVector &operator=(const TestVector &) = delete;
+  TestVector &operator=(TestVector &&) = delete;
+
+  ~TestVector() {
+    if (Buffer) {
+#ifdef _MSC_VER
+      _aligned_free(Buffer);
+#else
+      std::free(Buffer);
+#endif
+    }
+  }
+
+  size_t getNumVectors() const { return NumVectors; }
+  size_t getVectorSize() const { return VectorSize; }
+  size_t getElementSize() const { return ElementSize; }
+  size_t getStride() const { return Stride; }
+  size_t getTotalBytes() const { return TotalBytes; }
+  uint8_t *getBuffer() { return Buffer; }
+  const uint8_t *getBuffer() const { return Buffer; }
+
+  /// Returns vector I viewed as an array of T. I is not range-checked.
+  template <typename T> T *getVector(size_t I) {
+    uint8_t *Ptr = Buffer + I * Stride;
+    return reinterpret_cast<T *>(Ptr);
+  }
+
+  template <typename T> const T *getVector(size_t I) const {
+    const uint8_t *Ptr = Buffer + I * Stride;
+    return reinterpret_cast<const T *>(Ptr);
+  }
+
+  /// Sets the first VectorSize elements of every vector to Value.
+  template <typename T> void fill(const T &Value) {
+    for (size_t I = 0; I < NumVectors; ++I) {
+      T *Vec = getVector<T>(I);
+      for (size_t J = 0; J < VectorSize; ++J)
+        Vec[J] = Value;
+    }
+  }
+
+  /// Writes (1, 1, 0, ...) into every vector. When AsHalf is true the values
+  /// are stored as half-precision bit patterns. HALF is an alias of uint16_t,
+  /// so the type alone cannot tell the two apart; the default keeps the
+  /// half conversion for fillSimpleTestData<HALF>() callers.
+  template <typename T,
+            bool AsHalf = std::is_same_v<T, DirectX::PackedVector::HALF>>
+  void fillSimpleTestData() {
+    for (size_t I = 0; I < NumVectors; ++I) {
+      T *Vec = getVector<T>(I);
+      for (size_t J = 0; J < VectorSize; ++J) {
+        const bool IsOne = J < 2;
+        if constexpr (AsHalf) {
+          Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(IsOne ? 1.0f : 0.0f));
+        } else {
+          Vec[J] = static_cast<T>(IsOne ? 1 : 0);
+        }
+      }
+    }
+  }
+
+  /// Creates NumVectors vectors of VectorSize elements holding (1, 1, 0, ...)
+  /// stored as DataType. A 32-bit integer DataType combined with a
+  /// *_T4_PACKED DataInterpretation is stored one byte per element.
+  /// Throws std::invalid_argument for any DataType not listed below.
+  static TestVector
+  createSimpleTestVector(size_t NumVectors, size_t VectorSize,
+                         D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
+                         D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+    using DirectX::PackedVector::HALF;
+    const bool Packed =
+        DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+        DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED;
+    switch (DataType) {
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+      return makeSimple<int8_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+      return makeSimple<uint8_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+      return makeSimple<int16_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+      return makeSimple<uint16_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+      // Packed data is sized and filled as bytes; filling it as int32_t would
+      // overrun the byte-sized slots.
+      return Packed ? makeSimple<int8_t>(NumVectors, VectorSize)
+                    : makeSimple<int32_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+      return Packed ? makeSimple<uint8_t>(NumVectors, VectorSize)
+                    : makeSimple<uint32_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+      return makeSimple<HALF, true>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+      return makeSimple<float>(NumVectors, VectorSize);
+    default:
+      throw std::invalid_argument("Unsupported data type");
+    }
+  }
+};
 }; // namespace CoopVecHelpers
 
 #endif // HAVE_COOPVEC_API
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 55d569dd8d..f47b4624d6 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12241,6 +12241,112 @@ void ExecutionTest::runCoopVecMulSubtest(
   CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle(
       DescriptorHeap->GetCPUDescriptorHandleForHeapStart());
 
+  // Setup input data
+  auto ExpectedOutputBuffer =
+      std::make_unique<float[]>(Config.OutputPerThread * Config.NumThreads);
+
+  std::vector<uint8_t> InputMatrix;
+  if (MulProps.MatrixInterpretation ==
+          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+      MulProps.MatrixInterpretation ==
+          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
+      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(
+        Config.InputPerThread, Config.OutputPerThread);
+  } else if (MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+    // Matrix source data is fp32, which gets converted to fp16 during matrix
+    // conversion
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(
+        Config.InputPerThread, Config.OutputPerThread);
+  } else {
+    WEX::Logging::Log::Error(L"Unsupported matrix data type");
+    return;
+  }
+
+  auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector(
+      Config.NumThreads, Config.InputPerThread, MulProps.InputType,
+      MulProps.InputInterpretation);
+  auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector(
+      1, Config.OutputPerThread, MulProps.BiasInterpretation,
+      MulProps.BiasInterpretation);
+
+  // Calculate reference output. Input vectors are read per thread through
+  // getVector() so the indexing honors the TestVector stride, which is what
+  // the shader uses (INPUT_VECTOR_STRIDE) and may include padding bytes.
+  // FIXME: This does not capture all cases, but is sufficient for the preview
+  // feature set
+  if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
+    // The bias is a single vector, so vector 0 is the start of its buffer.
+    const int32_t *InputBiasI32 = InputBias.getVector<int32_t>(0);
+    const bool InputIsF32 =
+        MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
+
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
+        int Acc = 0;
+
+        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
+          int const InputElem =
+              InputIsF32
+                  ? (int)InputVector.getVector<float>(ThreadIdx)[InputIdx]
+                  : InputVector.getVector<int8_t>(ThreadIdx)[InputIdx];
+          int const MatrixElem =
+              InputMatrix[OutputIdx * Config.InputPerThread + InputIdx];
+          Acc += InputElem * MatrixElem;
+        }
+
+        if (Config.Bias) {
+          Acc += InputBiasI32[OutputIdx];
+        }
+
+        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
+            float(Acc);
+      }
+    }
+  } else if (MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+    using DirectX::PackedVector::HALF;
+    // The bias is a single vector, so vector 0 is the start of its buffer.
+    const HALF *InputBiasFP16 = InputBias.getVector<HALF>(0);
+
+    // The CPU reference matrix is float
+    std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
+    std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
+
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      // Start of this thread's input vector, padding excluded.
+      const HALF *ThreadVec = InputVector.getVector<HALF>(ThreadIdx);
+
+      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
+        float Acc = 0;
+
+        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
+          float const InputElem = ConvertFloat16ToFloat32(ThreadVec[InputIdx]);
+          float const MatrixElem =
+              InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
+          Acc += InputElem * MatrixElem;
+        }
+
+        if (Config.Bias) {
+          Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
+        }
+
+        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
+            Acc;
+      }
+    }
+  }
+
   // Create the compute pipeline state for the CoopVec shader
   CComPtr<ID3D12PipelineState> ComputePipelineState;
   {
@@ -12258,9 +12364,7 @@ void main(uint threadIdx : SV_GroupThreadID)
 {
   using namespace dx::linalg;
 
-  // Ensure 4-byte alignment for vector loads
-  uint inputOffset = (INPUT_PER_THREAD * threadIdx * (sizeof(INPUT_DATA_TYPE) / INPUT_DIVISOR));
-  inputOffset = (inputOffset + 3) & ~3; // Align to 4 bytes
+  uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
   vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> >(inputOffset);
 
   MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix, 0, STRIDE };
@@ -12278,7 +12382,6 @@ void main(uint threadIdx : SV_GroupThreadID)
 
   // Ensure 4-byte alignment for vector store
   uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float);
-  outputOffset = (outputOffset + 3) & ~3; // Align to 4 bytes
   OutputBuffer.Store<vector<float, OUTPUT_PER_THREAD> >(outputOffset, result);
 }
     )";
@@ -12349,6 +12452,8 @@ void main(uint threadIdx : SV_GroupThreadID)
     auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
     auto AccumInterpretationEnumDefine = CreateDefineFromString(
         L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum);
+    auto InputVectorStrideDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
 
     LPCWSTR Options[] = {
         L"-enable-16bit-types",
@@ -12364,6 +12469,7 @@ void main(uint threadIdx : SV_GroupThreadID)
         MatrixDataTypeEnumDefine.c_str(),
         UseBiasDefine.c_str(),
         AccumInterpretationEnumDefine.c_str(),
+        InputVectorStrideDefine.c_str(),
     };
 
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
@@ -12388,36 +12494,9 @@ void main(uint threadIdx : SV_GroupThreadID)
       0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
       IID_PPV_ARGS(&CommandList)));
 
-  // Setup input data
-  auto ExpectedOutputBuffer =
-      std::make_unique<float[]>(Config.OutputPerThread * Config.NumThreads);
-
   // Setup input matrix as all-ones in sint8 format. This will later be
   // converted to the appropriate data type by the matrix conversion API.
   CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
-  std::vector<uint8_t> InputMatrix;
-  if (MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // Matrix source data is fp32, which gets converted to fp16 during matrix
-    // conversion
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
-    return;
-  }
 
   CreateTestResources(D3DDevice, CommandList, InputMatrix.data(),
                       InputMatrix.size(),
@@ -12427,180 +12506,31 @@ void main(uint threadIdx : SV_GroupThreadID)
   // Create input vector of an appropriate type. All integer types start as
   // SINT8 for now.
   CComPtr<ID3D12Resource> InputVecSRVResource, InputVecSRVUploadResource;
-  std::vector<uint8_t> InputVector;
-
-  if ((MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32 &&
-       (MulProps.InputInterpretation ==
-            D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-        MulProps.InputInterpretation ==
-            D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED)) ||
-      MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputVector = CoopVecHelpers::CreateInputVector<int8_t>(
-        Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    InputVector =
-        CoopVecHelpers::CreateInputVector<DirectX::PackedVector::HALF>(
-            Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    InputVector = CoopVecHelpers::CreateInputVector<float>(
-        Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) {
-    InputVector = CoopVecHelpers::CreateInputVector<int32_t>(
-        Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) {
-    InputVector = CoopVecHelpers::CreateInputVector<uint32_t>(
-        Config.NumThreads, Config.InputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported input data type");
-    return;
-  }
-  if (InputVector.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputVector.resize(InputVector.size() + 4 - (InputVector.size() % 4));
-  }
-  CreateTestResources(D3DDevice, CommandList, InputVector.data(),
-                      InputVector.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputVector.size()),
-                      &InputVecSRVResource, &InputVecSRVUploadResource);
+
+  CreateTestResources(
+      D3DDevice, CommandList, InputVector.getBuffer(),
+      InputVector.getTotalBytes(),
+      CD3DX12_RESOURCE_DESC::Buffer(InputVector.getTotalBytes()),
+      &InputVecSRVResource, &InputVecSRVUploadResource);
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector.size() / sizeof(int32_t)),
+               (UINT)(InputVector.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource);
 
   // Create input bias
   CComPtr<ID3D12Resource> InputBiasSRVResource, InputBiasSRVUploadResource;
-  std::vector<uint8_t> InputBias;
 
-  if (MulProps.BiasInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-      MulProps.BiasInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
-      MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputBias = CoopVecHelpers::CreateInputBias<int8_t>(Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) {
-    InputBias =
-        CoopVecHelpers::CreateInputBias<int32_t>(Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) {
-    InputBias =
-        CoopVecHelpers::CreateInputBias<uint32_t>(Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
-    InputBias = CoopVecHelpers::CreateInputBias<DirectX::PackedVector::HALF>(
-        Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    InputBias = CoopVecHelpers::CreateInputBias<float>(Config.OutputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported bias data type");
-    return;
-  }
-
-  if (InputBias.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputBias.resize(InputBias.size() + 4 - (InputBias.size() % 4));
-  }
-  CreateTestResources(D3DDevice, CommandList, InputBias.data(),
-                      InputBias.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputBias.size()),
+  CreateTestResources(D3DDevice, CommandList, InputBias.getBuffer(),
+                      InputBias.getTotalBytes(),
+                      CD3DX12_RESOURCE_DESC::Buffer(InputBias.getTotalBytes()),
                       &InputBiasSRVResource, &InputBiasSRVUploadResource);
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputBias.size() / sizeof(int32_t)),
+               (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)),
                InputBiasSRVResource);
 
-  // Calculate reference output
-  // FIXME: This does not capture all cases, but is sufficient for the preview
-  // feature set
-  if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-    // The input bias is really an array of int32_t
-    std::vector<int32_t> InputBiasI32(InputBias.size() / sizeof(int32_t));
-    std::memcpy(InputBiasI32.data(), InputBias.data(), InputBias.size());
-
-    // The input vector is really an array of float if our vector input type is
-    // FLOAT32
-    std::vector<float> InputVectorF32(InputVector.size() / sizeof(int32_t));
-    if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-      std::memcpy(InputVectorF32.data(), InputVector.data(),
-                  InputVector.size());
-    }
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        int Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          int InputElem;
-          if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-            InputElem = (int)
-                InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx];
-          } else {
-            InputElem =
-                InputVector[ThreadIdx * Config.InputPerThread + InputIdx];
-          }
-          int const MatrixElem =
-              InputMatrix[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += InputBiasI32[OutputIdx];
-        }
-
-        float Result = float(Acc);
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // The input bias/vector is really an array of float16
-    std::vector<DirectX::PackedVector::HALF> InputVectorFP16(
-        InputVector.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputVectorFP16.data(), InputVector.data(), InputVector.size());
-
-    std::vector<DirectX::PackedVector::HALF> InputBiasFP16(
-        InputBias.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputBiasFP16.data(), InputBias.data(), InputBias.size());
-
-    // The CPU reference matrix is float
-    std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
-    std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        float Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          float const InputElem = ConvertFloat16ToFloat32(
-              InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]);
-          float const MatrixElem =
-              InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
-        }
-
-        float Result = Acc;
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
-  }
-
   CComPtr<ID3D12Resource> ConvertedMatrixResource;
   {
     // Create source matrix info
@@ -12862,6 +12792,80 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
   CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle(
       DescriptorHeap->GetCPUDescriptorHandleForHeapStart());
 
+  // Setup input matrix as all-ones in sint8/fp32 format. This will later be
+  // converted to the appropriate data type by the matrix conversion API.
+
+  std::vector<uint8_t> InputMatrix;
+  if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+      AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(Config.DimN,
+                                                                   Config.DimM);
+  } else if (AccumulateProps.AccumulationType ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+             AccumulateProps.AccumulationType ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+             AccumulateProps.AccumulationType ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+    // Matrix source data is fp32, which gets converted to fp16 during matrix
+    // conversion
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN,
+                                                                  Config.DimM);
+  } else {
+    WEX::Logging::Log::Error(L"Unsupported matrix data type");
+    return;
+  }
+
+  // Create input vectors
+  auto InputVector1 = CoopVecHelpers::TestVector::createSimpleTestVector(
+      Config.NumThreads, Config.DimM, AccumulateProps.InputType,
+      AccumulateProps.InputType);
+  auto InputVector2 = CoopVecHelpers::TestVector::createSimpleTestVector(
+      Config.NumThreads, Config.DimN, AccumulateProps.InputType,
+      AccumulateProps.InputType);
+
+  // Calculate reference output. The expected matrix is seeded from
+  // CreateAllOnesInputMatrix<float>; each thread then adds the outer product
+  // of its own pair of input vectors. The vectors are located through
+  // getVector() so the TestVector stride, padding included, is honored.
+  auto ExpectedOutputBufferI8 =
+      CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN, Config.DimM);
+  std::vector<float> ExpectedOutputBuffer(ExpectedOutputBufferI8.size() /
+                                          sizeof(float));
+  std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(),
+              ExpectedOutputBufferI8.size());
+
+  if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
+    using DirectX::PackedVector::HALF;
+
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      // Operands belonging to this thread.
+      const HALF *Vec1 = InputVector1.getVector<HALF>(ThreadIdx);
+      const HALF *Vec2 = InputVector2.getVector<HALF>(ThreadIdx);
+
+      for (int M = 0; M < Config.DimM; ++M) {
+        for (int N = 0; N < Config.DimN; ++N) {
+          float Acc = ConvertFloat16ToFloat32(Vec1[M]) *
+                      ConvertFloat16ToFloat32(Vec2[N]);
+          ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
+        }
+      }
+    }
+  } else if (AccumulateProps.InputType ==
+             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      // Operands belonging to this thread.
+      const float *Vec1 = InputVector1.getVector<float>(ThreadIdx);
+      const float *Vec2 = InputVector2.getVector<float>(ThreadIdx);
+
+      for (int M = 0; M < Config.DimM; ++M) {
+        for (int N = 0; N < Config.DimN; ++N) {
+          float Acc = Vec1[M] * Vec2[N];
+          ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
+        }
+      }
+    }
+  }
+
   // Create a compute pipeline state object.
   CComPtr<ID3D12PipelineState> ComputePipelineState;
   {
@@ -12880,12 +12884,10 @@ void main(uint threadIdx : SV_GroupThreadID)
   using namespace dx::linalg;
 
   // Ensure 4-byte alignment for vector loads
-  uint inputOffset1 = (DIM_M * threadIdx * sizeof(INPUT_DATA_TYPE));
-  inputOffset1 = (inputOffset1 + 3) & ~3; // Align to 4 bytes
+  uint inputOffset1 = threadIdx * INPUT_VECTOR_1_STRIDE;
   vector<INPUT_DATA_TYPE, DIM_M / INPUT_DIVISOR> input1 = InputVector1.Load<vector<INPUT_DATA_TYPE, DIM_M / INPUT_DIVISOR> >(inputOffset1);
 
-  uint inputOffset2 = (DIM_N * threadIdx * sizeof(INPUT_DATA_TYPE));
-  inputOffset2 = (inputOffset2 + 3) & ~3; // Align to 4 bytes
+  uint inputOffset2 = threadIdx * INPUT_VECTOR_2_STRIDE;
   vector<INPUT_DATA_TYPE, DIM_N / INPUT_DIVISOR> input2 = InputVector2.Load<vector<INPUT_DATA_TYPE, DIM_N / INPUT_DIVISOR> >(inputOffset2);
 
   RWMatrixRef<MATRIX_DATA_TYPE_ENUM, DIM_M, DIM_N, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { AccumMatrix, 0, STRIDE };
@@ -12954,6 +12956,10 @@ void main(uint threadIdx : SV_GroupThreadID)
         CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str());
     auto MatrixDataTypeEnumDefine = CreateDefineFromString(
         L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str());
+    auto InputVector1StrideDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride());
+    auto InputVector2StrideDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride());
 
     LPCWSTR Options[] = {
         L"-enable-16bit-types",
@@ -12967,6 +12973,8 @@ void main(uint threadIdx : SV_GroupThreadID)
         InputInterpretationEnumDefine.c_str(),
         HlslMatrixLayoutDefine.c_str(),
         MatrixDataTypeEnumDefine.c_str(),
+        InputVector1StrideDefine.c_str(),
+        InputVector2StrideDefine.c_str(),
     };
 
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
@@ -12991,142 +12999,34 @@ void main(uint threadIdx : SV_GroupThreadID)
       0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
       IID_PPV_ARGS(&CommandList)));
 
-  // Setup input matrix as all-ones in sint8/fp32 format. This will later be
-  // converted to the appropriate data type by the matrix conversion API.
   CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
-  std::vector<uint8_t> InputMatrix;
-  if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(Config.DimN,
-                                                                   Config.DimM);
-  } else if (AccumulateProps.AccumulationType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             AccumulateProps.AccumulationType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             AccumulateProps.AccumulationType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // Matrix source data is fp32, which gets converted to fp16 during matrix
-    // conversion
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN,
-                                                                  Config.DimM);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
-    return;
-  }
-
   CreateTestResources(D3DDevice, CommandList, InputMatrix.data(),
                       InputMatrix.size(),
                       CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()),
                       &InputMatrixSRVResource, &InputMatrixSRVUploadResource);
 
-  // Create input vectors
   CComPtr<ID3D12Resource> InputVecSRVResource1, InputVecSRVUploadResource1;
-  std::vector<uint8_t> InputVector1;
   CComPtr<ID3D12Resource> InputVecSRVResource2, InputVecSRVUploadResource2;
-  std::vector<uint8_t> InputVector2;
-
-  if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputVector1 = CoopVecHelpers::CreateInputVector<int8_t>(Config.NumThreads,
-                                                             Config.DimM);
-    InputVector2 = CoopVecHelpers::CreateInputVector<int8_t>(Config.NumThreads,
-                                                             Config.DimN);
-  } else if (AccumulateProps.InputType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             AccumulateProps.InputType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             AccumulateProps.InputType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    InputVector1 =
-        CoopVecHelpers::CreateInputVector<DirectX::PackedVector::HALF>(
-            Config.NumThreads, Config.DimM);
-    InputVector2 =
-        CoopVecHelpers::CreateInputVector<DirectX::PackedVector::HALF>(
-            Config.NumThreads, Config.DimN);
-  } else if (AccumulateProps.InputType ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    InputVector1 = CoopVecHelpers::CreateInputVector<float>(Config.NumThreads,
-                                                            Config.DimM);
-    InputVector2 = CoopVecHelpers::CreateInputVector<float>(Config.NumThreads,
-                                                            Config.DimN);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported input data type");
-    return;
-  }
-  if (InputVector1.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputVector1.resize(InputVector1.size() + 4 - (InputVector1.size() % 4));
-  }
-  if (InputVector2.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputVector2.resize(InputVector2.size() + 4 - (InputVector2.size() % 4));
-  }
-  CreateTestResources(D3DDevice, CommandList, InputVector1.data(),
-                      InputVector1.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputVector1.size()),
-                      &InputVecSRVResource1, &InputVecSRVUploadResource1);
-  CreateTestResources(D3DDevice, CommandList, InputVector2.data(),
-                      InputVector2.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputVector2.size()),
-                      &InputVecSRVResource2, &InputVecSRVUploadResource2);
+
+  CreateTestResources(
+      D3DDevice, CommandList, InputVector1.getBuffer(),
+      InputVector1.getTotalBytes(),
+      CD3DX12_RESOURCE_DESC::Buffer(InputVector1.getTotalBytes()),
+      &InputVecSRVResource1, &InputVecSRVUploadResource1);
+  CreateTestResources(
+      D3DDevice, CommandList, InputVector2.getBuffer(),
+      InputVector2.getTotalBytes(),
+      CD3DX12_RESOURCE_DESC::Buffer(InputVector2.getTotalBytes()),
+      &InputVecSRVResource2, &InputVecSRVUploadResource2);
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector1.size() / sizeof(int32_t)),
+               (UINT)(InputVector1.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource1);
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector2.size() / sizeof(int32_t)),
+               (UINT)(InputVector2.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource2);
 
-  // Calculate reference output
-  auto ExpectedOutputBufferI8 =
-      CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN, Config.DimM);
-  std::vector<float> ExpectedOutputBuffer(ExpectedOutputBufferI8.size() /
-                                          sizeof(float));
-  std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(),
-              ExpectedOutputBufferI8.size());
-
-  if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
-    std::vector<DirectX::PackedVector::HALF> InputVector1FP16(
-        InputVector1.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputVector1FP16.data(), InputVector1.data(),
-                InputVector1.size());
-
-    std::vector<DirectX::PackedVector::HALF> InputVector2FP16(
-        InputVector2.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputVector2FP16.data(), InputVector2.data(),
-                InputVector2.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int M = 0; M < Config.DimM; ++M) {
-        for (int N = 0; N < Config.DimN; ++N) {
-          float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) *
-                      ConvertFloat16ToFloat32(InputVector2FP16[N]);
-          ExpectedOutputBuffer[M * Config.DimN + N] += acc;
-        }
-      }
-    }
-  } else if (AccumulateProps.InputType ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    std::vector<float> InputVector1FP32(InputVector1.size() / sizeof(float));
-    std::memcpy(InputVector1FP32.data(), InputVector1.data(),
-                InputVector1.size());
-
-    std::vector<float> InputVector2FP32(InputVector2.size() / sizeof(float));
-    std::memcpy(InputVector2FP32.data(), InputVector2.data(),
-                InputVector2.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int M = 0; M < Config.DimM; ++M) {
-        for (int N = 0; N < Config.DimN; ++N) {
-          float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] *
-                      InputVector2FP32[ThreadIdx * Config.DimN + N];
-          ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
-        }
-      }
-    }
-  }
-
   CComPtr<ID3D12Resource> ConvertedMatrixResource, ConvertedMatrixReadResource;
   int ConvertedMatrixSize = 0;
   {

From 91b2c7613dc380006e2aead1d8f0a451769bc833 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 6 May 2025 16:17:37 -0400
Subject: [PATCH 02/26] Support odd matrix/vector sizes

---
 .../unittests/HLSLExec/ExecutionTest.cpp      | 84 +++++++++++++------
 1 file changed, 58 insertions(+), 26 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index f47b4624d6..934210af1f 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12149,6 +12149,14 @@ void ExecutionTest::runCoopVecMulTestConfig(
       {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
       {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
@@ -12157,6 +12165,14 @@ void ExecutionTest::runCoopVecMulTestConfig(
       {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
       {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
@@ -12165,6 +12181,14 @@ void ExecutionTest::runCoopVecMulTestConfig(
       {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
        false},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
@@ -12181,6 +12205,22 @@ void ExecutionTest::runCoopVecMulTestConfig(
        false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
        true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
   };
 
   for (auto Config : TestConfigs) {
@@ -12280,18 +12320,15 @@ void ExecutionTest::runCoopVecMulSubtest(
   // FIXME: This does not capture all cases, but is sufficient for the preview
   // feature set
   if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-    int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer();
-    float *InputVectorF32 = (float *)InputVector.getBuffer();
-
     for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      int32_t *InputBiasI32 = InputBias.getVector<int32_t>(0);
       for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
         int Acc = 0;
 
         for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
           int InputElem;
           if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-            InputElem = (int)
-                InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx];
+            InputElem = (int)InputVector.getVector<float>(ThreadIdx)[InputIdx];
           } else {
             InputElem = InputVector.getVector<int8_t>(ThreadIdx)[InputIdx];
           }
@@ -12315,22 +12352,21 @@ void ExecutionTest::runCoopVecMulSubtest(
                  D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
              MulProps.MatrixInterpretation ==
                  D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    DirectX::PackedVector::HALF *InputVectorFP16 =
-        (DirectX::PackedVector::HALF *)InputVector.getBuffer();
-    DirectX::PackedVector::HALF *InputBiasFP16 =
-        (DirectX::PackedVector::HALF *)InputBias.getBuffer();
-
     // The CPU reference matrix is float
     std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
     std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
 
     for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      DirectX::PackedVector::HALF *InputVectorFP16 =
+          InputVector.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
+      DirectX::PackedVector::HALF *InputBiasFP16 =
+          InputBias.getVector<DirectX::PackedVector::HALF>(0);
       for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
         float Acc = 0;
 
         for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          float const InputElem = ConvertFloat16ToFloat32(
-              InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]);
+          float const InputElem =
+              ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]);
           float const MatrixElem =
               InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
           Acc += InputElem * MatrixElem;
@@ -12365,7 +12401,7 @@ void main(uint threadIdx : SV_GroupThreadID)
   using namespace dx::linalg;
 
   uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
-  vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> >(inputOffset);
+  vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> >(inputOffset);
 
   MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix, 0, STRIDE };
 
@@ -12439,8 +12475,9 @@ void main(uint threadIdx : SV_GroupThreadID)
     auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
     auto InputDataTypeDefine =
         CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType);
-    auto InputDivisorDefine =
-        CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor);
+    auto InputDivisorDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_NUM_ELEMENTS",
+        (Config.InputPerThread + InputDivisor - 1) / InputDivisor);
     auto AccumDataTypeDefine =
         CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType);
     auto InputInterpretationEnumDefine = CreateDefineFromString(
@@ -12596,11 +12633,12 @@ void main(uint threadIdx : SV_GroupThreadID)
           &ConvertInfo.DestInfo);
     }
 
+    int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16;
+
     // Create resource to hold matrix copy
-    CreateTestResources(
-        D3DDevice, CommandList, nullptr, 0,
-        CD3DX12_RESOURCE_DESC::Buffer(ConvertInfo.DestInfo.DestSize),
-        &ConvertedMatrixResource, nullptr);
+    CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize,
+                        CD3DX12_RESOURCE_DESC::Buffer(SRVSize),
+                        &ConvertedMatrixResource, nullptr);
 
     // Set up data descriptors
     ConvertInfo.DataDesc.DestVA =
@@ -12613,13 +12651,7 @@ void main(uint threadIdx : SV_GroupThreadID)
         __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
     CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
-    // This increments baseHandle
-    if ((ConvertInfo.DestInfo.DestSize % 4) != 0) {
-      WEX::Logging::Log::Error(L"DestSize is not aligned to 4 bytes");
-      return;
-    }
-    CreateRawSRV(D3DDevice, BaseHandle,
-                 ConvertInfo.DestInfo.DestSize / sizeof(int32_t),
+    CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t),
                  ConvertedMatrixResource);
   }
 

From ce38c677a104ae3e5c165dd91b01cf92964f6a01 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 13:10:59 -0400
Subject: [PATCH 03/26] Finish support for NumLayers=2

---
 tools/clang/unittests/HLSLExec/CoopVec.h      | 275 ++++++++++++
 .../unittests/HLSLExec/ExecutionTest.cpp      | 410 +++++++++---------
 2 files changed, 492 insertions(+), 193 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index cd24a556bd..b5c0a2f355 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -448,6 +448,74 @@ struct TestVector {
   uint8_t *getBuffer() { return Buffer; }
   const uint8_t *getBuffer() const { return Buffer; }
 
+  // Copy assignment: deep-copies other's metadata and buffer contents.
+  TestVector &operator=(const TestVector &other) {
+    if (this != &other) {
+      // Allocate and fill the replacement first so a failed allocation
+      // leaves *this untouched. An empty source needs no allocation at all.
+      void *Ptr = nullptr;
+      if (other.Buffer && other.TotalBytes > 0) {
+#ifdef _MSC_VER
+        Ptr = _aligned_malloc(other.TotalBytes, 16);
+#else
+        // aligned_alloc wants a size that is a multiple of the alignment.
+        Ptr = std::aligned_alloc(16, (other.TotalBytes + 15) / 16 * 16);
+#endif
+        if (!Ptr)
+          throw std::runtime_error("TestVector: buffer allocation failed");
+        std::memcpy(Ptr, other.Buffer, other.TotalBytes);
+      }
+      // Release the previously held buffer.
+      if (Buffer) {
+#ifdef _MSC_VER
+        _aligned_free(Buffer);
+#else
+        std::free(Buffer);
+#endif
+      }
+
+      // Commit the new state.
+      NumVectors = other.NumVectors;
+      VectorSize = other.VectorSize;
+      ElementSize = other.ElementSize;
+      Stride = other.Stride;
+      TotalBytes = other.TotalBytes;
+      Buffer = reinterpret_cast<uint8_t *>(Ptr);
+    }
+    return *this;
+  }
+
+  // Move assignment: takes over other's buffer and metadata; other ends empty.
+  TestVector &operator=(TestVector &&other) noexcept {
+    if (this != &other) {
+      // Free the buffer this object currently holds, if any
+      if (Buffer) {
+#ifdef _MSC_VER
+        _aligned_free(Buffer);
+#else
+        std::free(Buffer);
+#endif
+      }
+
+      // Take other's metadata and its buffer pointer; no bytes are copied
+      NumVectors = other.NumVectors;
+      VectorSize = other.VectorSize;
+      ElementSize = other.ElementSize;
+      Stride = other.Stride;
+      TotalBytes = other.TotalBytes;
+      Buffer = other.Buffer;
+
+      // Zero the source's fields and null its Buffer so only *this refers to it
+      other.NumVectors = 0;
+      other.VectorSize = 0;
+      other.ElementSize = 0;
+      other.Stride = 0;
+      other.TotalBytes = 0;
+      other.Buffer = nullptr;
+    }
+    return *this;
+  }
+
   template <typename T> T *getVector(size_t I) {
     uint8_t *Ptr = Buffer + I * Stride;
     return reinterpret_cast<T *>(Ptr);
@@ -481,6 +549,20 @@ struct TestVector {
     }
   }
 
+  template <typename T> void fillAllOnesTestData() {
+    // Write the value one, as T, to VectorSize elements of every vector
+    for (size_t I = 0; I < NumVectors; ++I) {
+      T *Vec = getVector<T>(I); // start of vector I (Buffer + I * Stride)
+      for (size_t J = 0; J < VectorSize; ++J)
+        if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
+          // HALF stores the result of converting 1.0f, not a cast of integer 1
+          Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(1.0f));
+        } else {
+          Vec[J] = static_cast<T>(1);
+        }
+    }
+  }
+
   static TestVector
   createSimpleTestVector(size_t NumVectors, size_t VectorSize,
                          D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
@@ -553,6 +635,199 @@ struct TestVector {
     }
     return Vec;
   }
+
+  static TestVector // Builds a NumVectors x VectorSize matrix filled with ones.
+  createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize,
+                          D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+    size_t ElementSize; // bytes per stored element; set by the switch below
+    switch (DataInterpretation) {
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+      ElementSize = sizeof(int8_t); // every integer type is stored as int8_t
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+      ElementSize = sizeof(float); // every float type is stored as float
+      break;
+    default:
+      throw std::invalid_argument("Unsupported data type");
+    }
+    TestVector Vec(NumVectors, VectorSize, ElementSize);
+    switch (DataInterpretation) { // same two groups as above, now writing ones
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+      Vec.fillAllOnesTestData<int8_t>();
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+      Vec.fillAllOnesTestData<float>();
+      break;
+    default: // not reachable in practice: the first switch already threw
+      throw std::invalid_argument("Unsupported data type");
+    }
+    return Vec;
+  }
+
+  D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO
+  getConversionInfo(ID3D12Device *D3DDevice,
+                    D3D12_LINEAR_ALGEBRA_DATATYPE DestDataType,
+                    D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout) {
+    // Build a conversion descriptor for this matrix; the source is row-major
+    D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {};
+    ConvertInfo.SrcInfo.SrcDataType =
+        ::CoopVecHelpers::GetMatrixSrcDataType(DestDataType);
+    ConvertInfo.SrcInfo.SrcLayout =
+        D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
+
+    // Create destination matrix info; DestEltSize only feeds DestStride below
+    ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver
+    int DestEltSize = 0; // stays 0 when DestDataType is not listed below
+    switch (DestDataType) { // no default: unlisted types keep the zeroed fields
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED: // also converts to SINT8
+      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
+      DestEltSize = 1;
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16;
+      DestEltSize = 2; // FP16
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+      ConvertInfo.DestInfo.DestDataType =
+          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3;
+      DestEltSize = 1; // FP8
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+      ConvertInfo.DestInfo.DestDataType =
+          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2;
+      DestEltSize = 1; // FP8
+      break;
+    }
+    ConvertInfo.SrcInfo.SrcStride = (UINT)getStride(); // bytes between vectors
+    ConvertInfo.SrcInfo.SrcSize = (UINT)getTotalBytes();
+
+    ConvertInfo.DestInfo.DestLayout = MatrixLayout;
+    ConvertInfo.DestInfo.DestStride = 0; // kept for layouts not handled below
+    ConvertInfo.DestInfo.NumRows = (UINT)getNumVectors(); // one row per vector
+    ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize();
+
+    if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
+      ConvertInfo.DestInfo.DestStride = (UINT)getVectorSize() * DestEltSize;
+    } else if (MatrixLayout ==
+               D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
+      ConvertInfo.DestInfo.DestStride = (UINT)getNumVectors() * DestEltSize;
+    }
+
+    // Get destination size using preview interface
+    {
+      CComPtr<ID3D12DevicePreview> PreviewDevice;
+      VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview),
+                                                 (void **)&PreviewDevice));
+
+      // Query required destination size (any return value is not checked here)
+      PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo(
+          &ConvertInfo.DestInfo);
+    }
+
+    return ConvertInfo;
+  }
+
+  static TestVector // CPU reference: Matrix * each input vector, plus Bias.
+  matrixVectorMultiply(const TestVector &Matrix, const TestVector &InputVector,
+                       const TestVector &Bias, bool HasBias,
+                       D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
+                       D3D12_LINEAR_ALGEBRA_DATATYPE InputType) {
+    bool IsFP32 = false; // true when Matrix elements are read as float below
+    switch (MatrixInterpretation) {
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+      IsFP32 = true;
+      break;
+    default:
+      break;
+    }
+
+    TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(),
+                         sizeof(float)); // one float per (input vector, row)
+
+    if (IsFP32) {
+      // NOTE(review): int indices are compared with getNumVectors(); confirm
+      for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
+        const DirectX::PackedVector::HALF *InputBiasFP16 =
+            Bias.getVector<DirectX::PackedVector::HALF>(0); // bias read as HALF
+        for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors();
+             ++OutputIdx) {
+          float Acc = 0;
+
+          for (int InputIdx = 0; InputIdx < Matrix.getVectorSize();
+               ++InputIdx) {
+            float InputElem;
+            if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
+              InputElem = InputVector.getVector<float>(VecIdx)[InputIdx];
+            } else { // every other InputType is read as HALF
+              InputElem = ConvertFloat16ToFloat32(
+                  InputVector.getVector<DirectX::PackedVector::HALF>(
+                      VecIdx)[InputIdx]);
+            }
+            float const MatrixElem =
+                Matrix.getVector<float>(OutputIdx)[InputIdx];
+            Acc += InputElem * MatrixElem;
+          }
+
+          if (HasBias) {
+            Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
+          }
+
+          float Result = Acc;
+          ResultVec.getVector<float>(VecIdx)[OutputIdx] = Result;
+        }
+      }
+    } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
+      for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
+        const int32_t *InputBiasI32 = Bias.getVector<int32_t>(0); // int32 bias
+        for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors();
+             ++OutputIdx) {
+          int Acc = 0;
+
+          for (int InputIdx = 0; InputIdx < Matrix.getVectorSize();
+               ++InputIdx) {
+            int InputElem;
+            if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
+              InputElem = (int)InputVector.getVector<float>(VecIdx)[InputIdx];
+            } else { // every other InputType is read as int8_t
+              InputElem = InputVector.getVector<int8_t>(VecIdx)[InputIdx];
+            }
+            int const MatrixElem =
+                Matrix.getVector<int8_t>(OutputIdx)[InputIdx];
+            Acc += InputElem * MatrixElem;
+          }
+
+          if (HasBias) {
+            Acc += InputBiasI32[OutputIdx];
+          }
+
+          float Result = float(Acc); // integer accumulator is stored as float
+          ResultVec.getVector<float>(VecIdx)[OutputIdx] = Result;
+        }
+      }
+    } else {
+      throw std::invalid_argument("Unsupported matrix interpretation");
+    }
+
+    return ResultVec;
+  }
 };
 }; // namespace CoopVecHelpers
 
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 934210af1f..a613f28139 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -789,7 +789,7 @@ class ExecutionTest {
     int InputPerThread;
     int OutputPerThread;
     int NumThreads;
-    int NumLevels;
+    int NumLayers;
     D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout;
     bool Bias;
   };
@@ -12221,6 +12221,88 @@ void ExecutionTest::runCoopVecMulTestConfig(
        false},
       {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
        true},
+
+      // NumLayers=2 tests
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
   };
 
   for (auto Config : TestConfigs) {
@@ -12234,6 +12316,21 @@ void ExecutionTest::runCoopVecMulTestConfig(
       continue;
     }
 
+    if (Config.NumLayers > 1 &&
+        (MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+         MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 ||
+         MulProps.InputInterpretation ==
+             D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+         MulProps.InputInterpretation ==
+             D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) &&
+        (MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 ||
+         MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32)) {
+      // We do not support multi-layer tests with packed types as input with
+      // full-precision integer bias. Supporting this in the current framework
+      // would require repacking the accumulator vectors.
+      continue;
+    }
+
     bool IsInFilter = CoopVecHelpers::IsMatrixLayoutInFilter(
         L"CoopVecMatrixLayout", Config.MatrixLayout);
     if (!IsInFilter) {
@@ -12250,9 +12347,9 @@ void ExecutionTest::runCoopVecMulSubtest(
 
   LogCommentFmt(
       L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: "
-      L"%d, NumLevels: %d, Bias: %s, MatrixLayout: %s",
+      L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s",
       Config.InputPerThread, Config.OutputPerThread, Config.NumThreads,
-      Config.NumLevels, Config.Bias ? L"true" : L"false",
+      Config.NumLayers, Config.Bias ? L"true" : L"false",
       CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str());
 
   const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4);
@@ -12261,8 +12358,8 @@ void ExecutionTest::runCoopVecMulSubtest(
   CComPtr<ID3D12RootSignature> RootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE Ranges[2];
-    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 3, 0,
-                   0); // InputVector, InputMatrix, InputBias
+    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0,
+                   0); // InputVector, InputBias, InputMatrices[]
     Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer
     CreateRootSignatureFromRanges(D3DDevice, &RootSignature, Ranges, 2, nullptr,
                                   0);
@@ -12273,7 +12370,7 @@ void ExecutionTest::runCoopVecMulSubtest(
   {
     D3D12_DESCRIPTOR_HEAP_DESC Desc = {};
     Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-    Desc.NumDescriptors = 4;
+    Desc.NumDescriptors = 3 + Config.NumLayers;
     Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
     VERIFY_SUCCEEDED(
         D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap)));
@@ -12281,106 +12378,35 @@ void ExecutionTest::runCoopVecMulSubtest(
   CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle(
       DescriptorHeap->GetCPUDescriptorHandleForHeapStart());
 
-  // Setup input data
-  auto ExpectedOutputBuffer =
-      std::make_unique<float[]>(Config.OutputPerThread * Config.NumThreads);
-
-  std::vector<uint8_t> InputMatrix;
-  if (MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // Matrix source data is fp32, which gets converted to fp16 during matrix
-    // conversion
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
-    return;
-  }
+  // Our input matrix is really a set of row vectors, which we can represent
+  // as a TestVector.
+  std::vector<::CoopVecHelpers::TestVector> InputMatrices;
+  for (int I = 0; I < Config.NumLayers - 1; ++I) {
+    // Each layer except the last is InputPerThread x InputPerThread
+    InputMatrices.push_back(
+        ::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
+            Config.InputPerThread, Config.InputPerThread,
+            MulProps.MatrixInterpretation));
+  }
+  // Last layer, matrix size is OutputPerThread x InputPerThread
+  InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
+      Config.OutputPerThread, Config.InputPerThread,
+      MulProps.MatrixInterpretation));
 
   auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector(
       Config.NumThreads, Config.InputPerThread, MulProps.InputType,
       MulProps.InputInterpretation);
   auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector(
-      1, Config.OutputPerThread, MulProps.BiasInterpretation,
-      MulProps.BiasInterpretation);
+      1, std::max(Config.OutputPerThread, Config.InputPerThread),
+      MulProps.BiasInterpretation, MulProps.BiasInterpretation);
 
   // Calculate reference output
-  // FIXME: This does not capture all cases, but is sufficient for the preview
-  // feature set
-  if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      int32_t *InputBiasI32 = InputBias.getVector<int32_t>(0);
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        int Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          int InputElem;
-          if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-            InputElem = (int)InputVector.getVector<float>(ThreadIdx)[InputIdx];
-          } else {
-            InputElem = InputVector.getVector<int8_t>(ThreadIdx)[InputIdx];
-          }
-          int const MatrixElem =
-              InputMatrix[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += InputBiasI32[OutputIdx];
-        }
-
-        float Result = float(Acc);
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // The CPU reference matrix is float
-    std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
-    std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      DirectX::PackedVector::HALF *InputVectorFP16 =
-          InputVector.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
-      DirectX::PackedVector::HALF *InputBiasFP16 =
-          InputBias.getVector<DirectX::PackedVector::HALF>(0);
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        float Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          float const InputElem =
-              ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]);
-          float const MatrixElem =
-              InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
-        }
-
-        float Result = Acc;
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
+  auto ExpectedOutput = InputVector;
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    ExpectedOutput = ::CoopVecHelpers::TestVector::matrixVectorMultiply(
+        InputMatrices[I], ExpectedOutput, InputBias, Config.Bias,
+        MulProps.MatrixInterpretation,
+        I == 0 ? MulProps.InputType : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32);
   }
 
   // Create the compute pipeline state for the CoopVec shader
@@ -12391,7 +12417,7 @@ void ExecutionTest::runCoopVecMulSubtest(
 
 ByteAddressBuffer InputVector : register(t0);
 ByteAddressBuffer InputBias : register(t1);
-ByteAddressBuffer InputMatrix : register(t2);
+ByteAddressBuffer InputMatrix[NUM_LAYERS] : register(t2);
 RWByteAddressBuffer OutputBuffer: register(u0);
 
 [shader("compute")]
@@ -12402,25 +12428,57 @@ void main(uint threadIdx : SV_GroupThreadID)
 
   uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
   vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> >(inputOffset);
+  VectorRef<BIAS_INTERPRETATION_ENUM> biasVec = { InputBias, 0 };
+
+  vector<ACCUM_DATA_TYPE, OUTPUT_PER_THREAD> output;
+)";
+
+    if (Config.NumLayers == 1) {
+      ShaderSource += R"(
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix[0], 0, STRIDE };
+
+  if (USE_BIAS) {
+    output = MulAdd<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
+  } else {
+    output = Mul<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+  }
+)";
+    } else if (Config.NumLayers == 2) {
+      ShaderSource += R"(
+  vector<ACCUM_DATA_TYPE, INPUT_PER_THREAD> accum;
 
-  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix, 0, STRIDE };
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE };
+  if (USE_BIAS) {
+    accum = MulAdd<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
+    //accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+  } else {
+    accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+  }
 
-  vector<ACCUM_DATA_TYPE, OUTPUT_PER_THREAD> accum;
+  // Dummy activation function; all of our intermediates are positive (currently).
+  accum = max(accum, 0);
 
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat1 = { InputMatrix[1], 0, STRIDE };
   if (USE_BIAS) {
-    VectorRef<ACCUM_INTERPRETATION_ENUM> biasVec = { InputBias, 0 };
-    accum = MulAdd<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
+    output = MulAdd<ACCUM_DATA_TYPE>(mat1, MakeInterpretedVector<ACCUM_INTERPRETATION_ENUM>(accum), biasVec);
   } else {
-    accum = Mul<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+    output = Mul<ACCUM_DATA_TYPE>(mat1, MakeInterpretedVector<ACCUM_INTERPRETATION_ENUM>(accum));
   }
+)";
+    }
 
-  vector<float, OUTPUT_PER_THREAD> result = (vector<float, OUTPUT_PER_THREAD>)accum;
+    ShaderSource += R"(
+  vector<float, OUTPUT_PER_THREAD> result = (vector<float, OUTPUT_PER_THREAD>)output;
 
   // Ensure 4-byte alignment for vector store
   uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float);
   OutputBuffer.Store<vector<float, OUTPUT_PER_THREAD> >(outputOffset, result);
 }
-    )";
+)";
+
+#if 0
+    printf("%s\n", ShaderSource.c_str());
+#endif
 
     auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
       std::wstringstream Stream;
@@ -12462,7 +12520,7 @@ void main(uint threadIdx : SV_GroupThreadID)
     const std::wstring InputInterpretationEnum =
         CoopVecHelpers::GetHlslInterpretationForDataType(
             MulProps.InputInterpretation);
-    const std::wstring AccumInterpretationEnum =
+    const std::wstring BiasInterpretationEnum =
         CoopVecHelpers::GetHlslInterpretationForDataType(
             MulProps.BiasInterpretation);
 
@@ -12487,10 +12545,15 @@ void main(uint threadIdx : SV_GroupThreadID)
     auto MatrixDataTypeEnumDefine =
         CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum);
     auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
+    // Treat the accumulator interpretation the same as the input interpretation
+    // for the purposes of MakeInterpretedVector.
     auto AccumInterpretationEnumDefine = CreateDefineFromString(
-        L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum);
+        L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum);
     auto InputVectorStrideDefine = CreateDefineFromInt(
         L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
+    auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers);
+    auto BiasInterpretationEnumDefine = CreateDefineFromString(
+        L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum);
 
     LPCWSTR Options[] = {
         L"-enable-16bit-types",
@@ -12507,8 +12570,18 @@ void main(uint threadIdx : SV_GroupThreadID)
         UseBiasDefine.c_str(),
         AccumInterpretationEnumDefine.c_str(),
         InputVectorStrideDefine.c_str(),
+        NumLayersDefine.c_str(),
+        BiasInterpretationEnumDefine.c_str(),
     };
 
+#if 0
+    // Print options for debugging
+    WEX::Logging::Log::Comment(L"Shader compilation options:");
+    for (UINT i = 0; i < _countof(Options); i++) {
+      WEX::Logging::Log::Comment(Options[i]);
+    }
+#endif
+
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
         new LinAlgHeaderIncludeHandler(m_support);
 
@@ -12531,14 +12604,17 @@ void main(uint threadIdx : SV_GroupThreadID)
       0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
       IID_PPV_ARGS(&CommandList)));
 
-  // Setup input matrix as all-ones in sint8 format. This will later be
-  // converted to the appropriate data type by the matrix conversion API.
-  CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
-
-  CreateTestResources(D3DDevice, CommandList, InputMatrix.data(),
-                      InputMatrix.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()),
-                      &InputMatrixSRVResource, &InputMatrixSRVUploadResource);
+  std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVResources(
+      Config.NumLayers);
+  std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVUploadResources(
+      Config.NumLayers);
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    CreateTestResources(
+        D3DDevice, CommandList, InputMatrices[I].getBuffer(),
+        InputMatrices[I].getTotalBytes(),
+        CD3DX12_RESOURCE_DESC::Buffer(InputMatrices[I].getTotalBytes()),
+        &InputMatrixSRVResources[I], &InputMatrixSRVUploadResources[I]);
+  }
 
   // Create input vector of an appropriate type. All integer types start as
   // SINT8 for now.
@@ -12568,82 +12644,25 @@ void main(uint threadIdx : SV_GroupThreadID)
                (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)),
                InputBiasSRVResource);
 
-  CComPtr<ID3D12Resource> ConvertedMatrixResource;
-  {
-    // Create source matrix info
-    D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {};
-    ConvertInfo.SrcInfo.SrcDataType =
-        CoopVecHelpers::GetMatrixSrcDataType(MulProps.MatrixInterpretation);
-    ConvertInfo.SrcInfo.SrcLayout =
-        D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
-
-    // Create destination matrix info
-    ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver
-    int SrcEltSize = 0;
-    int DestEltSize = 0;
-    switch (MulProps.MatrixInterpretation) {
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED:
-      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
-      SrcEltSize = 1;
-      DestEltSize = 1;
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
-      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16;
-      SrcEltSize = 4;  // FP32
-      DestEltSize = 2; // FP16
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
-      ConvertInfo.DestInfo.DestDataType =
-          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3;
-      SrcEltSize = 4;  // FP32
-      DestEltSize = 1; // FP8
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
-      ConvertInfo.DestInfo.DestDataType =
-          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2;
-      SrcEltSize = 4;  // FP32
-      DestEltSize = 1; // FP8
-      break;
-    }
-    ConvertInfo.SrcInfo.SrcStride = Config.InputPerThread * SrcEltSize;
-    ConvertInfo.SrcInfo.SrcSize =
-        Config.InputPerThread * Config.OutputPerThread * SrcEltSize;
-
-    ConvertInfo.DestInfo.DestLayout = Config.MatrixLayout;
-    ConvertInfo.DestInfo.DestStride = 0;
-    ConvertInfo.DestInfo.NumRows = Config.OutputPerThread;
-    ConvertInfo.DestInfo.NumColumns = Config.InputPerThread;
-
-    if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
-      ConvertInfo.DestInfo.DestStride = Config.InputPerThread * DestEltSize;
-    } else if (Config.MatrixLayout ==
-               D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
-      ConvertInfo.DestInfo.DestStride = Config.OutputPerThread * DestEltSize;
-    }
-
-    // Get destination size using preview interface
-    {
-      CComPtr<ID3D12DevicePreview> PreviewDevice;
-      VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview),
-                                                 (void **)&PreviewDevice));
-
-      // Query required destination size
-      PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo(
-          &ConvertInfo.DestInfo);
-    }
+  // Create converted matrix resource and SRV for each input matrix
+  std::vector<CComPtr<ID3D12Resource>> ConvertedMatrixResources(
+      Config.NumLayers);
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    auto ConvertInfo = InputMatrices[I].getConversionInfo(
+        D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout);
 
     int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16;
 
     // Create resource to hold matrix copy
     CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize,
                         CD3DX12_RESOURCE_DESC::Buffer(SRVSize),
-                        &ConvertedMatrixResource, nullptr);
+                        &ConvertedMatrixResources[I], nullptr);
 
     // Set up data descriptors
     ConvertInfo.DataDesc.DestVA =
-        ConvertedMatrixResource->GetGPUVirtualAddress();
-    ConvertInfo.DataDesc.SrcVA = InputMatrixSRVResource->GetGPUVirtualAddress();
+        ConvertedMatrixResources[I]->GetGPUVirtualAddress();
+    ConvertInfo.DataDesc.SrcVA =
+        InputMatrixSRVResources[I]->GetGPUVirtualAddress();
 
     // Get command list interface and perform conversion
     CComPtr<ID3D12GraphicsCommandList11> CommandList11;
@@ -12651,8 +12670,9 @@ void main(uint threadIdx : SV_GroupThreadID)
         __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
     CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
+    // This increments BaseHandle
     CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t),
-                 ConvertedMatrixResource);
+                 ConvertedMatrixResources[I]);
   }
 
   CComPtr<ID3D12Resource> UavResource;
@@ -12697,14 +12717,18 @@ void main(uint threadIdx : SV_GroupThreadID)
 
     float *ResultBuffer = (float *)MappedData.data();
     bool Equal = true;
-    for (int i = 0; i < OutputBufferSize / sizeof(float); i++) {
-      if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) ||
-          fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) {
-        LogErrorFmt(L"Result mismatch at index %d", i);
-        LogErrorFmt(L"ResultBuffer[%d]: %f, ExpectedOutputBuffer[%d]: %f", i,
-                    ResultBuffer[i], i, ExpectedOutputBuffer[i]);
-        Equal = false;
-        break;
+
+    for (int i = 0; i < Config.NumThreads; ++i) {
+      for (int j = 0; j < Config.OutputPerThread; ++j) {
+        float Result = ResultBuffer[i * Config.OutputPerThread + j];
+        float Expected = ExpectedOutput.getVector<float>(i)[j];
+        if (isnan(Result) || isnan(Expected) ||
+            fabs(Result - Expected) > 0.00001) {
+          LogErrorFmt(L"Result mismatch at index %d",
+                      i * Config.OutputPerThread + j);
+          LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected);
+          Equal = false;
+        }
       }
     }
     VERIFY_IS_TRUE(Equal);

From 03cf74d28a55b3472d2f20295f75c23ee54bcb8d Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 13:15:01 -0400
Subject: [PATCH 04/26] Remove dead code

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index a613f28139..3d69815034 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12476,10 +12476,6 @@ void main(uint threadIdx : SV_GroupThreadID)
 }
 )";
 
-#if 0
-    printf("%s\n", ShaderSource.c_str());
-#endif
-
     auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
       std::wstringstream Stream;
       Stream << L"-D" << Name << L"=" << Value;
@@ -12574,14 +12570,6 @@ void main(uint threadIdx : SV_GroupThreadID)
         BiasInterpretationEnumDefine.c_str(),
     };
 
-#if 0
-    // Print options for debugging
-    WEX::Logging::Log::Comment(L"Shader compilation options:");
-    for (UINT i = 0; i < _countof(Options); i++) {
-      WEX::Logging::Log::Comment(Options[i]);
-    }
-#endif
-
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
         new LinAlgHeaderIncludeHandler(m_support);
 

From 68069f0209097fa80ac370636387a8e13e61c973 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Thu, 8 May 2025 07:25:28 -0400
Subject: [PATCH 05/26] Remove dead line

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 3d69815034..ef769b12f7 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12450,7 +12450,6 @@ void main(uint threadIdx : SV_GroupThreadID)
   MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE };
   if (USE_BIAS) {
     accum = MulAdd<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
-    //accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
   } else {
     accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
   }

From 33bcadf518fa3e22bb0ad1aa66bd787cdbf4daf5 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Thu, 8 May 2025 07:27:07 -0400
Subject: [PATCH 06/26] Add comment about ambiguous IsFP32 flag

---
 tools/clang/unittests/HLSLExec/CoopVec.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index b5c0a2f355..18b8669197 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -749,12 +749,13 @@ struct TestVector {
                        const TestVector &Bias, bool HasBias,
                        D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
                        D3D12_LINEAR_ALGEBRA_DATATYPE InputType) {
-    bool IsFP32 = false;
+    // The CPU reference matrix is FP32 for all FP interpretations.
+    bool IsMatrixFP32 = false;
     switch (MatrixInterpretation) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
-      IsFP32 = true;
+      IsMatrixFP32 = true;
       break;
     default:
       break;
@@ -763,7 +764,7 @@ struct TestVector {
     TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(),
                          sizeof(float));
 
-    if (IsFP32) {
+    if (IsMatrixFP32) {
       for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
         const DirectX::PackedVector::HALF *InputBiasFP16 =
             Bias.getVector<DirectX::PackedVector::HALF>(0);

From 70d642e2e5806de791ea66e017eb31d3b87e62bd Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 15:45:54 -0400
Subject: [PATCH 07/26] Initial support for CoopVec pixel shader tests

---
 .../unittests/HLSLExec/ExecutionTest.cpp      | 342 ++++++++++++------
 1 file changed, 228 insertions(+), 114 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index ef769b12f7..51206893e9 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -799,7 +799,7 @@ class ExecutionTest {
                           D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps);
   void runCoopVecMulSubtest(ID3D12Device *D3DDevice,
                             D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps,
-                            CoopVecMulSubtestConfig &Config);
+                            CoopVecMulSubtestConfig &Config, bool RunCompute);
 
   struct CoopVecOuterProductSubtestConfig {
     int DimM; // Row Count
@@ -815,6 +815,7 @@ class ExecutionTest {
       ID3D12Device *D3DDevice,
       D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps,
       CoopVecOuterProductSubtestConfig &Config);
+
 #endif // HAVE_COOPVEC_API
 
   template <class T1, class T2>
@@ -12337,13 +12338,15 @@ void ExecutionTest::runCoopVecMulTestConfig(
       continue;
     }
 
-    runCoopVecMulSubtest(D3DDevice, MulProps, Config);
+    // Run once as compute, then again as graphics (pixel shader)
+    runCoopVecMulSubtest(D3DDevice, MulProps, Config, true);
+    runCoopVecMulSubtest(D3DDevice, MulProps, Config, false);
   }
 }
 
 void ExecutionTest::runCoopVecMulSubtest(
     ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps,
-    CoopVecMulSubtestConfig &Config) {
+    CoopVecMulSubtestConfig &Config, bool RunCompute) {
 
   LogCommentFmt(
       L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: "
@@ -12361,8 +12364,17 @@ void ExecutionTest::runCoopVecMulSubtest(
     Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0,
                    0); // InputVector, InputBias, InputMatrices[]
     Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer
-    CreateRootSignatureFromRanges(D3DDevice, &RootSignature, Ranges, 2, nullptr,
-                                  0);
+
+    CD3DX12_ROOT_PARAMETER RootParams[2];
+    RootParams[0].InitAsDescriptorTable(_countof(Ranges), Ranges,
+                                        D3D12_SHADER_VISIBILITY_ALL);
+    RootParams[1].InitAsUnorderedAccessView(/* register */ 10, /* space */ 0,
+                                            D3D12_SHADER_VISIBILITY_ALL);
+
+    CD3DX12_ROOT_SIGNATURE_DESC RootSignatureDesc;
+    RootSignatureDesc.Init(_countof(RootParams), RootParams, 0, nullptr,
+                           D3D12_ROOT_SIGNATURE_FLAG_NONE);
+    CreateRootSignatureFromDesc(D3DDevice, &RootSignatureDesc, &RootSignature);
   }
 
   // Create descriptor heap with space for 4 descriptors: 3 SRVs and 1 UAV
@@ -12411,8 +12423,8 @@ void ExecutionTest::runCoopVecMulSubtest(
 
   // Create the compute pipeline state for the CoopVec shader
   CComPtr<ID3D12PipelineState> ComputePipelineState;
-  {
-    std::string ShaderSource = R"(
+
+  std::string ShaderSource = R"(
 #include "dx/linalg.h"
 
 ByteAddressBuffer InputVector : register(t0);
@@ -12420,9 +12432,9 @@ ByteAddressBuffer InputBias : register(t1);
 ByteAddressBuffer InputMatrix[NUM_LAYERS] : register(t2);
 RWByteAddressBuffer OutputBuffer: register(u0);
 
-[shader("compute")]
-[numthreads(NUM_THREADS, 1, 1)]
-void main(uint threadIdx : SV_GroupThreadID)
+RWStructuredBuffer<uint> AtomicCounter : register(u10);
+
+void RunCoopVecTest(uint threadIdx)
 {
   using namespace dx::linalg;
 
@@ -12433,8 +12445,8 @@ void main(uint threadIdx : SV_GroupThreadID)
   vector<ACCUM_DATA_TYPE, OUTPUT_PER_THREAD> output;
 )";
 
-    if (Config.NumLayers == 1) {
-      ShaderSource += R"(
+  if (Config.NumLayers == 1) {
+    ShaderSource += R"(
   MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix[0], 0, STRIDE };
 
   if (USE_BIAS) {
@@ -12443,8 +12455,8 @@ void main(uint threadIdx : SV_GroupThreadID)
     output = Mul<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
   }
 )";
-    } else if (Config.NumLayers == 2) {
-      ShaderSource += R"(
+  } else if (Config.NumLayers == 2) {
+    ShaderSource += R"(
   vector<ACCUM_DATA_TYPE, INPUT_PER_THREAD> accum;
 
   MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE };
@@ -12464,117 +12476,168 @@ void main(uint threadIdx : SV_GroupThreadID)
     output = Mul<ACCUM_DATA_TYPE>(mat1, MakeInterpretedVector<ACCUM_INTERPRETATION_ENUM>(accum));
   }
 )";
-    }
+  }
 
-    ShaderSource += R"(
+  ShaderSource += R"(
   vector<float, OUTPUT_PER_THREAD> result = (vector<float, OUTPUT_PER_THREAD>)output;
 
   // Ensure 4-byte alignment for vector store
   uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float);
   OutputBuffer.Store<vector<float, OUTPUT_PER_THREAD> >(outputOffset, result);
 }
-)";
 
-    auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
-      std::wstringstream Stream;
-      Stream << L"-D" << Name << L"=" << Value;
-      return Stream.str();
-    };
+[shader("compute")]
+[numthreads(NUM_THREADS, 1, 1)]
+void main(uint threadIdx : SV_GroupThreadID)
+{
+  RunCoopVecTest(threadIdx);
+}
 
-    auto CreateDefineFromString = [](const wchar_t *Name,
-                                     const std::wstring &Value) {
-      std::wstringstream Stream;
-      Stream << L"-D" << Name << L"=" << Value;
-      return Stream.str();
-    };
+float4 vs_main(uint vid : SV_VertexID) : SV_Position { // Oversized triangle.
+  switch (vid) { // Clip-space corners; w = 1 so they are not clipped away.
+  case 0:
+    return float4(-1, 1, 0, 1);
+  case 1:
+    return float4(3, 1, 0, 1);
+  case 2:
+    return float4(-1, -3, 0, 1);
+  }
+  return float4(0, 0, 0, 1);
+}
 
-    int Stride = 0;
-    const std::wstring HlslMatrixLayout =
-        CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
-    int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
-        MulProps.MatrixInterpretation);
-    switch (Config.MatrixLayout) {
-    case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR:
-      Stride = Config.InputPerThread * StrideMultiplier;
-      break;
-    case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR:
-      Stride = Config.OutputPerThread * StrideMultiplier;
-      break;
-    }
+float4 ps_main() : SV_Target { // Each pixel invocation acts as one test thread.
+  uint threadIdx; // Set to the counter's pre-increment value (unique per call).
+  InterlockedAdd(AtomicCounter[0], 1, threadIdx);
+  if (threadIdx < NUM_THREADS) RunCoopVecTest(threadIdx); // Skip surplus pixels.
+  return float4(1, 1, 1, 1);
+}
+)";
 
-    const int InputDivisor =
-        CoopVecHelpers::GetNumPackedElementsForInputDataType(
-            MulProps.InputInterpretation);
-    const std::wstring InputDataType =
-        CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.InputType);
-    const std::wstring AccumDataType =
-        CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.BiasInterpretation);
-    const std::wstring MatrixDataTypeEnum =
-        CoopVecHelpers::GetHlslInterpretationForDataType(
-            MulProps.MatrixInterpretation);
-    const std::wstring InputInterpretationEnum =
-        CoopVecHelpers::GetHlslInterpretationForDataType(
-            MulProps.InputInterpretation);
-    const std::wstring BiasInterpretationEnum =
-        CoopVecHelpers::GetHlslInterpretationForDataType(
-            MulProps.BiasInterpretation);
+  auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
+    std::wstringstream Stream;
+    Stream << L"-D" << Name << L"=" << Value;
+    return Stream.str();
+  };
 
-    auto InputPerThreadDefine =
-        CreateDefineFromInt(L"INPUT_PER_THREAD", Config.InputPerThread);
-    auto OutputPerThreadDefine =
-        CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread);
-    auto NumThreadsDefine =
-        CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
-    auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
-    auto InputDataTypeDefine =
-        CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType);
-    auto InputDivisorDefine = CreateDefineFromInt(
-        L"INPUT_VECTOR_NUM_ELEMENTS",
-        (Config.InputPerThread + InputDivisor - 1) / InputDivisor);
-    auto AccumDataTypeDefine =
-        CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType);
-    auto InputInterpretationEnumDefine = CreateDefineFromString(
-        L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum);
-    auto HlslMatrixLayoutDefine =
-        CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout);
-    auto MatrixDataTypeEnumDefine =
-        CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum);
-    auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
-    // Treat the accumulator interpretation the same as the input interpretation
-    // for the purposes of MakeInterpretedVector.
-    auto AccumInterpretationEnumDefine = CreateDefineFromString(
-        L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum);
-    auto InputVectorStrideDefine = CreateDefineFromInt(
-        L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
-    auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers);
-    auto BiasInterpretationEnumDefine = CreateDefineFromString(
-        L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum);
+  auto CreateDefineFromString = [](const wchar_t *Name,
+                                   const std::wstring &Value) {
+    std::wstringstream Stream;
+    Stream << L"-D" << Name << L"=" << Value;
+    return Stream.str();
+  };
 
-    LPCWSTR Options[] = {
-        L"-enable-16bit-types",
-        InputPerThreadDefine.c_str(),
-        OutputPerThreadDefine.c_str(),
-        NumThreadsDefine.c_str(),
-        StrideDefine.c_str(),
-        InputDataTypeDefine.c_str(),
-        InputDivisorDefine.c_str(),
-        AccumDataTypeDefine.c_str(),
-        InputInterpretationEnumDefine.c_str(),
-        HlslMatrixLayoutDefine.c_str(),
-        MatrixDataTypeEnumDefine.c_str(),
-        UseBiasDefine.c_str(),
-        AccumInterpretationEnumDefine.c_str(),
-        InputVectorStrideDefine.c_str(),
-        NumLayersDefine.c_str(),
-        BiasInterpretationEnumDefine.c_str(),
-    };
+  int Stride = 0;
+  const std::wstring HlslMatrixLayout =
+      CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
+  int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
+      MulProps.MatrixInterpretation);
+  switch (Config.MatrixLayout) {
+  case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR:
+    Stride = Config.InputPerThread * StrideMultiplier;
+    break;
+  case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR:
+    Stride = Config.OutputPerThread * StrideMultiplier;
+    break;
+  }
 
-    CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
-        new LinAlgHeaderIncludeHandler(m_support);
+  const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType(
+      MulProps.InputInterpretation);
+  const std::wstring InputDataType =
+      CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.InputType);
+  const std::wstring AccumDataType =
+      CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.BiasInterpretation);
+  const std::wstring MatrixDataTypeEnum =
+      CoopVecHelpers::GetHlslInterpretationForDataType(
+          MulProps.MatrixInterpretation);
+  const std::wstring InputInterpretationEnum =
+      CoopVecHelpers::GetHlslInterpretationForDataType(
+          MulProps.InputInterpretation);
+  const std::wstring BiasInterpretationEnum =
+      CoopVecHelpers::GetHlslInterpretationForDataType(
+          MulProps.BiasInterpretation);
+
+  auto InputPerThreadDefine =
+      CreateDefineFromInt(L"INPUT_PER_THREAD", Config.InputPerThread);
+  auto OutputPerThreadDefine =
+      CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread);
+  auto NumThreadsDefine =
+      CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
+  auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
+  auto InputDataTypeDefine =
+      CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType);
+  auto InputDivisorDefine = CreateDefineFromInt(
+      L"INPUT_VECTOR_NUM_ELEMENTS",
+      (Config.InputPerThread + InputDivisor - 1) / InputDivisor);
+  auto AccumDataTypeDefine =
+      CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType);
+  auto InputInterpretationEnumDefine = CreateDefineFromString(
+      L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum);
+  auto HlslMatrixLayoutDefine =
+      CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout);
+  auto MatrixDataTypeEnumDefine =
+      CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum);
+  auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
+  // Treat the accumulator interpretation the same as the input interpretation
+  // for the purposes of MakeInterpretedVector.
+  auto AccumInterpretationEnumDefine = CreateDefineFromString(
+      L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum);
+  auto InputVectorStrideDefine =
+      CreateDefineFromInt(L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
+  auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers);
+  auto BiasInterpretationEnumDefine = CreateDefineFromString(
+      L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum);
+
+  LPCWSTR Options[] = {
+      L"-enable-16bit-types",
+      InputPerThreadDefine.c_str(),
+      OutputPerThreadDefine.c_str(),
+      NumThreadsDefine.c_str(),
+      StrideDefine.c_str(),
+      InputDataTypeDefine.c_str(),
+      InputDivisorDefine.c_str(),
+      AccumDataTypeDefine.c_str(),
+      InputInterpretationEnumDefine.c_str(),
+      HlslMatrixLayoutDefine.c_str(),
+      MatrixDataTypeEnumDefine.c_str(),
+      UseBiasDefine.c_str(),
+      AccumInterpretationEnumDefine.c_str(),
+      InputVectorStrideDefine.c_str(),
+      NumLayersDefine.c_str(),
+      BiasInterpretationEnumDefine.c_str(),
+  };
 
+  CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
+      new LinAlgHeaderIncludeHandler(m_support);
+
+  if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
                      &ComputePipelineState, Options, _countof(Options),
                      IncludeHandler);
+  } else {
+    CComPtr<ID3DBlob> VertexShader;
+    CComPtr<ID3DBlob> PixelShader;
+
+    CompileFromText(ShaderSource.c_str(), L"vs_main", L"vs_6_9", &VertexShader,
+                    Options, _countof(Options), IncludeHandler);
+    CompileFromText(ShaderSource.c_str(), L"ps_main", L"ps_6_9", &PixelShader,
+                    Options, _countof(Options), IncludeHandler);
+
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {};
+    // No InputLayout: vs_main derives its positions from SV_VertexID.
+    PsoDesc.pRootSignature = RootSignature;
+    PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader);
+    PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader);
+    PsoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
+    PsoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
+    PsoDesc.DepthStencilState.DepthEnable = FALSE;
+    PsoDesc.DepthStencilState.StencilEnable = FALSE;
+    PsoDesc.SampleMask = UINT_MAX;
+    PsoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+    PsoDesc.NumRenderTargets = 1;
+    PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
+    PsoDesc.SampleDesc.Count = 1;
+    VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState(
+        &PsoDesc, IID_PPV_ARGS(&ComputePipelineState)));
   }
 
   // Create a command list for the compute shader.
@@ -12662,6 +12725,14 @@ void main(uint threadIdx : SV_GroupThreadID)
                  ConvertedMatrixResources[I]);
   }
 
+  CComPtr<ID3D12Resource> AtomicCounterResource; // ps_main's root-UAV counter
+  uint32_t AtomicCounterInit = 0;
+  CreateTestResources( // UAV flag: the counter is bound as a root UAV (u10).
+      D3DDevice, CommandList, &AtomicCounterInit, sizeof(AtomicCounterInit),
+      CD3DX12_RESOURCE_DESC::Buffer(sizeof(AtomicCounterInit),
+                                    D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS),
+      &AtomicCounterResource, nullptr);
+
   CComPtr<ID3D12Resource> UavResource;
   CComPtr<ID3D12Resource> UavUploadResource;
   CComPtr<ID3D12Resource> UavReadResource;
@@ -12687,10 +12758,54 @@ void main(uint threadIdx : SV_GroupThreadID)
   CD3DX12_GPU_DESCRIPTOR_HANDLE ResHandle(
       DescriptorHeap->GetGPUDescriptorHandleForHeapStart());
 
-  CommandList->SetComputeRootSignature(RootSignature);
-  CommandList->SetComputeRootDescriptorTable(0, ResHandle);
-  CommandList->SetPipelineState(ComputePipelineState);
-  CommandList->Dispatch(1, 1, 1);
+  CComPtr<ID3D12DescriptorHeap> RtvHeap;
+  CComPtr<ID3D12Resource> RenderTarget;
+  CComPtr<ID3D12Resource> RenderTargetRead;
+
+  if (RunCompute) {
+    CommandList->SetComputeRootSignature(RootSignature);
+    CommandList->SetComputeRootDescriptorTable(0, ResHandle);
+    CommandList->SetPipelineState(ComputePipelineState);
+    CommandList->Dispatch(1, 1, 1);
+  } else {
+    UINT FrameCount = 1;
+    UINT RtvDescSize = 0;
+    CreateRtvDescriptorHeap(D3DDevice, FrameCount, &RtvHeap, &RtvDescSize);
+    CreateRenderTargetAndReadback(D3DDevice, RtvHeap, 100, 100, &RenderTarget,
+                                  &RenderTargetRead);
+
+    D3D12_RESOURCE_DESC RtDesc = RenderTarget->GetDesc();
+    D3D12_VIEWPORT Viewport;
+    D3D12_RECT ScissorRect;
+
+    memset(&Viewport, 0, sizeof(Viewport));
+    Viewport.Height = (float)RtDesc.Height;
+    Viewport.Width = (float)RtDesc.Width;
+    Viewport.MaxDepth = 1.0f;
+    memset(&ScissorRect, 0, sizeof(ScissorRect));
+    ScissorRect.right = (long)RtDesc.Width;
+    ScissorRect.bottom = RtDesc.Height;
+    CommandList->SetGraphicsRootSignature(RootSignature);
+    CommandList->SetGraphicsRootDescriptorTable(0, ResHandle);
+    CommandList->SetGraphicsRootUnorderedAccessView(
+        1, AtomicCounterResource->GetGPUVirtualAddress());
+    CommandList->RSSetViewports(1, &Viewport);
+    CommandList->RSSetScissorRects(1, &ScissorRect);
+
+    // Indicate that the buffer will be used as a render target.
+    RecordTransitionBarrier(CommandList, RenderTarget,
+                            D3D12_RESOURCE_STATE_COPY_DEST,
+                            D3D12_RESOURCE_STATE_RENDER_TARGET);
+
+    CD3DX12_CPU_DESCRIPTOR_HANDLE RtvHandle(
+        RtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, RtvDescSize);
+    CommandList->OMSetRenderTargets(1, &RtvHandle, FALSE, nullptr);
+
+    CommandList->ClearRenderTargetView(RtvHandle, ClearColor, 0, nullptr);
+    CommandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
+    CommandList->DrawInstanced(3, 1, 0, 0);
+  }
+
   RecordTransitionBarrier(CommandList, UavResource,
                           D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
                           D3D12_RESOURCE_STATE_COPY_SOURCE);
@@ -12713,7 +12828,8 @@ void main(uint threadIdx : SV_GroupThreadID)
             fabs(Result - Expected) > 0.00001) {
           LogErrorFmt(L"Result mismatch at index %d",
                       i * Config.OutputPerThread + j);
-          LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected);
+          LogErrorFmt(L"Result: %f, Expected: %f  (stage: %s)", Result,
+                      Expected, RunCompute ? L"compute" : L"pixel");
           Equal = false;
         }
       }
@@ -12923,7 +13039,6 @@ RWByteAddressBuffer AccumMatrix : register(u0);
 [numthreads(NUM_THREADS, 1, 1)]
 void main(uint threadIdx : SV_GroupThreadID)
 {
-#if 1
   using namespace dx::linalg;
 
   // Ensure 4-byte alignment for vector loads
@@ -12936,7 +13051,6 @@ void main(uint threadIdx : SV_GroupThreadID)
   RWMatrixRef<MATRIX_DATA_TYPE_ENUM, DIM_M, DIM_N, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { AccumMatrix, 0, STRIDE };
 
   OuterProductAccumulate(input1, input2, mat);
-#endif
 }
     )";
 

From ac28b864d8398c64413551fcc605d19bfc4ba8cf Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 15:56:57 -0400
Subject: [PATCH 08/26] Support pixel shaders in OuterProduct tests

---
 .../unittests/HLSLExec/ExecutionTest.cpp      | 310 ++++++++++++------
 1 file changed, 211 insertions(+), 99 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 51206893e9..191899457f 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -814,7 +814,7 @@ class ExecutionTest {
   void runCoopVecOuterProductSubtest(
       ID3D12Device *D3DDevice,
       D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps,
-      CoopVecOuterProductSubtestConfig &Config);
+      CoopVecOuterProductSubtestConfig &Config, bool RunCompute);
 
 #endif // HAVE_COOPVEC_API
 
@@ -12913,29 +12913,41 @@ void ExecutionTest::runCoopVecOuterProductTestConfig(
       continue;
     }
 
-    runCoopVecOuterProductSubtest(D3DDevice, AccumulateProps, Config);
+    // Run once in compute, then once in graphics (pixel shader)
+    runCoopVecOuterProductSubtest(D3DDevice, AccumulateProps, Config, true);
+    runCoopVecOuterProductSubtest(D3DDevice, AccumulateProps, Config, false);
   }
 }
 
 void ExecutionTest::runCoopVecOuterProductSubtest(
     ID3D12Device *D3DDevice,
     D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps,
-    CoopVecOuterProductSubtestConfig &Config) {
+    CoopVecOuterProductSubtestConfig &Config, bool RunCompute) {
 
   LogCommentFmt(
-      L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s",
+      L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s, "
+      L"Stage: %s",
       Config.DimM, Config.DimN, Config.NumThreads,
-      CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str());
+      CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(),
+      RunCompute ? L"Compute" : L"Pixel");
 
   // Create root signature with a single root entry for all SRVs and UAVs
   CComPtr<ID3D12RootSignature> RootSignature;
   {
-    CD3DX12_DESCRIPTOR_RANGE ranges[2];
-    ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 0,
-                   0); // InputVector1, InputVector2
-    ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // AccumMatrix
-    CreateRootSignatureFromRanges(D3DDevice, &RootSignature, ranges, 2, nullptr,
-                                  0);
+    CD3DX12_DESCRIPTOR_RANGE Ranges[2];
+    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2, 0, 0);
+    Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0);
+
+    CD3DX12_ROOT_PARAMETER RootParams[2];
+    RootParams[0].InitAsDescriptorTable(_countof(Ranges), Ranges,
+                                        D3D12_SHADER_VISIBILITY_ALL);
+    RootParams[1].InitAsUnorderedAccessView(/* register */ 10, /* space */ 0,
+                                            D3D12_SHADER_VISIBILITY_ALL);
+
+    CD3DX12_ROOT_SIGNATURE_DESC RootSignatureDesc;
+    RootSignatureDesc.Init(_countof(RootParams), RootParams, 0, nullptr,
+                           D3D12_ROOT_SIGNATURE_FLAG_NONE);
+    CreateRootSignatureFromDesc(D3DDevice, &RootSignatureDesc, &RootSignature);
   }
 
   // Create descriptor heap with space for 3 descriptors: 2 SRVs and 1 UAV
@@ -13027,17 +13039,17 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
 
   // Create a compute pipeline state object.
   CComPtr<ID3D12PipelineState> ComputePipelineState;
-  {
-    std::string ShaderSource = R"(
+
+  std::string ShaderSource = R"(
 #include "dx/linalg.h"
 
 ByteAddressBuffer InputVector1 : register(t0);
 ByteAddressBuffer InputVector2 : register(t1);
 RWByteAddressBuffer AccumMatrix : register(u0);
 
-[shader("compute")]
-[numthreads(NUM_THREADS, 1, 1)]
-void main(uint threadIdx : SV_GroupThreadID)
+RWStructuredBuffer<uint> AtomicCounter : register(u10);
+
+void RunCoopVecTest(uint threadIdx)
 {
   using namespace dx::linalg;
 
@@ -13052,94 +13064,142 @@ void main(uint threadIdx : SV_GroupThreadID)
 
   OuterProductAccumulate(input1, input2, mat);
 }
-    )";
 
-    auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
-      std::wstringstream Stream;
-      Stream << L"-D" << Name << L"=" << Value;
-      return Stream.str();
-    };
+[shader("compute")]
+[numthreads(NUM_THREADS, 1, 1)]
+void main(uint threadIdx : SV_GroupThreadID)
+{
+  RunCoopVecTest(threadIdx);
+}
 
-    auto CreateDefineFromString = [](const wchar_t *Name,
-                                     const wchar_t *Value) {
-      std::wstringstream Stream;
-      Stream << L"-D" << Name << L"=" << Value;
-      return Stream.str();
-    };
+float4 vs_main(uint vid : SV_VertexID) : SV_Position { // Oversized triangle.
+  switch (vid) { // Clip-space corners; w = 1 so they are not clipped away.
+  case 0:
+    return float4(-1, 1, 0, 1);
+  case 1:
+    return float4(3, 1, 0, 1);
+  case 2:
+    return float4(-1, -3, 0, 1);
+  }
+  return float4(0, 0, 0, 1);
+}
 
-    int Stride = 0;
-    const std::wstring HlslMatrixLayout =
-        CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
-    int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
-        AccumulateProps.AccumulationType);
-    switch (Config.MatrixLayout) {
-    case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR:
-      Stride = Config.DimN * StrideMultiplier;
-      break;
-    case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR:
-      Stride = Config.DimM * StrideMultiplier;
-      break;
-    }
+float4 ps_main() : SV_Target { // Each pixel invocation acts as one test thread.
+  uint threadIdx; // Set to the counter's pre-increment value (unique per call).
+  InterlockedAdd(AtomicCounter[0], 1, threadIdx);
+  if (threadIdx < NUM_THREADS) RunCoopVecTest(threadIdx); // Skip surplus pixels.
+  return float4(1, 1, 1, 1);
+}
+)";
 
-    const int InputDivisor =
-        CoopVecHelpers::GetNumPackedElementsForInputDataType(
-            AccumulateProps.InputType);
-    const std::wstring InputDataType =
-        CoopVecHelpers::GetHlslDataTypeForDataType(AccumulateProps.InputType);
-    const std::wstring AccumDataType =
-        CoopVecHelpers::GetHlslDataTypeForDataType(
-            AccumulateProps.AccumulationType);
-    const std::wstring MatrixDataTypeEnum =
-        CoopVecHelpers::GetHlslInterpretationForDataType(
-            AccumulateProps.AccumulationType);
-    const std::wstring InputInterpretationEnum =
-        CoopVecHelpers::GetHlslInterpretationForDataType(
-            AccumulateProps.InputType);
-
-    auto DimMDefine = CreateDefineFromInt(L"DIM_M", Config.DimM);
-    auto DimNDefine = CreateDefineFromInt(L"DIM_N", Config.DimN);
-    auto NumThreadsDefine =
-        CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
-    auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
-    auto InputDataTypeDefine =
-        CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType.c_str());
-    auto InputDivisorDefine =
-        CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor);
-    auto AccumDataTypeDefine =
-        CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType.c_str());
-    auto InputInterpretationEnumDefine = CreateDefineFromString(
-        L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum.c_str());
-    auto HlslMatrixLayoutDefine =
-        CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str());
-    auto MatrixDataTypeEnumDefine = CreateDefineFromString(
-        L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str());
-    auto InputVector1StrideDefine = CreateDefineFromInt(
-        L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride());
-    auto InputVector2StrideDefine = CreateDefineFromInt(
-        L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride());
-
-    LPCWSTR Options[] = {
-        L"-enable-16bit-types",
-        DimMDefine.c_str(),
-        DimNDefine.c_str(),
-        NumThreadsDefine.c_str(),
-        StrideDefine.c_str(),
-        InputDataTypeDefine.c_str(),
-        InputDivisorDefine.c_str(),
-        AccumDataTypeDefine.c_str(),
-        InputInterpretationEnumDefine.c_str(),
-        HlslMatrixLayoutDefine.c_str(),
-        MatrixDataTypeEnumDefine.c_str(),
-        InputVector1StrideDefine.c_str(),
-        InputVector2StrideDefine.c_str(),
-    };
+  auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
+    std::wstringstream Stream;
+    Stream << L"-D" << Name << L"=" << Value;
+    return Stream.str();
+  };
+
+  auto CreateDefineFromString = [](const wchar_t *Name, const wchar_t *Value) {
+    std::wstringstream Stream;
+    Stream << L"-D" << Name << L"=" << Value;
+    return Stream.str();
+  };
+
+  int Stride = 0;
+  const std::wstring HlslMatrixLayout =
+      CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
+  int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
+      AccumulateProps.AccumulationType);
+  switch (Config.MatrixLayout) {
+  case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR:
+    Stride = Config.DimN * StrideMultiplier;
+    break;
+  case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR:
+    Stride = Config.DimM * StrideMultiplier;
+    break;
+  }
 
-    CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
-        new LinAlgHeaderIncludeHandler(m_support);
+  const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType(
+      AccumulateProps.InputType);
+  const std::wstring InputDataType =
+      CoopVecHelpers::GetHlslDataTypeForDataType(AccumulateProps.InputType);
+  const std::wstring AccumDataType = CoopVecHelpers::GetHlslDataTypeForDataType(
+      AccumulateProps.AccumulationType);
+  const std::wstring MatrixDataTypeEnum =
+      CoopVecHelpers::GetHlslInterpretationForDataType(
+          AccumulateProps.AccumulationType);
+  const std::wstring InputInterpretationEnum =
+      CoopVecHelpers::GetHlslInterpretationForDataType(
+          AccumulateProps.InputType);
 
+  auto DimMDefine = CreateDefineFromInt(L"DIM_M", Config.DimM);
+  auto DimNDefine = CreateDefineFromInt(L"DIM_N", Config.DimN);
+  auto NumThreadsDefine =
+      CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
+  auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
+  auto InputDataTypeDefine =
+      CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType.c_str());
+  auto InputDivisorDefine = CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor);
+  auto AccumDataTypeDefine =
+      CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType.c_str());
+  auto InputInterpretationEnumDefine = CreateDefineFromString(
+      L"INPUT_INTERPRETATION_ENUM", InputInterpretationEnum.c_str());
+  auto HlslMatrixLayoutDefine =
+      CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str());
+  auto MatrixDataTypeEnumDefine = CreateDefineFromString(
+      L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str());
+  auto InputVector1StrideDefine = CreateDefineFromInt(
+      L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride());
+  auto InputVector2StrideDefine = CreateDefineFromInt(
+      L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride());
+
+  LPCWSTR Options[] = {
+      L"-enable-16bit-types",
+      DimMDefine.c_str(),
+      DimNDefine.c_str(),
+      NumThreadsDefine.c_str(),
+      StrideDefine.c_str(),
+      InputDataTypeDefine.c_str(),
+      InputDivisorDefine.c_str(),
+      AccumDataTypeDefine.c_str(),
+      InputInterpretationEnumDefine.c_str(),
+      HlslMatrixLayoutDefine.c_str(),
+      MatrixDataTypeEnumDefine.c_str(),
+      InputVector1StrideDefine.c_str(),
+      InputVector2StrideDefine.c_str(),
+  };
+
+  CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
+      new LinAlgHeaderIncludeHandler(m_support);
+
+  if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
                      &ComputePipelineState, Options, _countof(Options),
                      IncludeHandler);
+  } else {
+    CComPtr<ID3DBlob> VertexShader;
+    CComPtr<ID3DBlob> PixelShader;
+
+    CompileFromText(ShaderSource.c_str(), L"vs_main", L"vs_6_9", &VertexShader,
+                    Options, _countof(Options), IncludeHandler);
+    CompileFromText(ShaderSource.c_str(), L"ps_main", L"ps_6_9", &PixelShader,
+                    Options, _countof(Options), IncludeHandler);
+
+    D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {};
+    // No InputLayout: vs_main derives its positions from SV_VertexID.
+    PsoDesc.pRootSignature = RootSignature;
+    PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader);
+    PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader);
+    PsoDesc.RasterizerState = CD3DX12_RASTERIZER_DESC(D3D12_DEFAULT);
+    PsoDesc.BlendState = CD3DX12_BLEND_DESC(D3D12_DEFAULT);
+    PsoDesc.DepthStencilState.DepthEnable = FALSE;
+    PsoDesc.DepthStencilState.StencilEnable = FALSE;
+    PsoDesc.SampleMask = UINT_MAX;
+    PsoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE;
+    PsoDesc.NumRenderTargets = 1;
+    PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
+    PsoDesc.SampleDesc.Count = 1;
+    VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState(
+        &PsoDesc, IID_PPV_ARGS(&ComputePipelineState)));
   }
 
   // Create a command list for the compute shader.
@@ -13282,6 +13342,14 @@ void main(uint threadIdx : SV_GroupThreadID)
                  ConvertedMatrixResource);
   }
 
+  // Create resource for atomic counter
+  CComPtr<ID3D12Resource> AtomicCounterResource;
+  uint32_t AtomicCounterInit = 0;
+  CreateTestResources(D3DDevice, CommandList, &AtomicCounterInit,
+                      sizeof(AtomicCounterInit),
+                      CD3DX12_RESOURCE_DESC::Buffer(sizeof(AtomicCounterInit)),
+                      &AtomicCounterResource, nullptr);
+
   CommandList->Close();
   ExecuteCommandList(CommandQueue, CommandList);
   WaitForSignal(CommandQueue, FO);
@@ -13293,10 +13361,54 @@ void main(uint threadIdx : SV_GroupThreadID)
   CD3DX12_GPU_DESCRIPTOR_HANDLE ResHandle(
       DescriptorHeap->GetGPUDescriptorHandleForHeapStart());
 
-  CommandList->SetComputeRootSignature(RootSignature);
-  CommandList->SetComputeRootDescriptorTable(0, ResHandle);
-  CommandList->SetPipelineState(ComputePipelineState);
-  CommandList->Dispatch(1, 1, 1);
+  CComPtr<ID3D12DescriptorHeap> RtvHeap;
+  CComPtr<ID3D12Resource> RenderTarget;
+  CComPtr<ID3D12Resource> RenderTargetRead;
+
+  if (RunCompute) {
+    CommandList->SetComputeRootSignature(RootSignature);
+    CommandList->SetComputeRootDescriptorTable(0, ResHandle);
+    CommandList->SetPipelineState(ComputePipelineState);
+    CommandList->Dispatch(1, 1, 1);
+  } else {
+    UINT FrameCount = 1;
+    UINT RtvDescSize = 0;
+    CreateRtvDescriptorHeap(D3DDevice, FrameCount, &RtvHeap, &RtvDescSize);
+    CreateRenderTargetAndReadback(D3DDevice, RtvHeap, 100, 100, &RenderTarget,
+                                  &RenderTargetRead);
+
+    D3D12_RESOURCE_DESC RtDesc = RenderTarget->GetDesc();
+    D3D12_VIEWPORT Viewport;
+    D3D12_RECT ScissorRect;
+
+    memset(&Viewport, 0, sizeof(Viewport));
+    Viewport.Height = (float)RtDesc.Height;
+    Viewport.Width = (float)RtDesc.Width;
+    Viewport.MaxDepth = 1.0f;
+    memset(&ScissorRect, 0, sizeof(ScissorRect));
+    ScissorRect.right = (long)RtDesc.Width;
+    ScissorRect.bottom = RtDesc.Height;
+    CommandList->SetGraphicsRootSignature(RootSignature);
+    CommandList->SetGraphicsRootDescriptorTable(0, ResHandle);
+    CommandList->SetGraphicsRootUnorderedAccessView(
+        1, AtomicCounterResource->GetGPUVirtualAddress());
+    CommandList->RSSetViewports(1, &Viewport);
+    CommandList->RSSetScissorRects(1, &ScissorRect);
+
+    // Indicate that the buffer will be used as a render target.
+    RecordTransitionBarrier(CommandList, RenderTarget,
+                            D3D12_RESOURCE_STATE_COPY_DEST,
+                            D3D12_RESOURCE_STATE_RENDER_TARGET);
+
+    CD3DX12_CPU_DESCRIPTOR_HANDLE RtvHandle(
+        RtvHeap->GetCPUDescriptorHandleForHeapStart(), 0, RtvDescSize);
+    CommandList->OMSetRenderTargets(1, &RtvHandle, FALSE, nullptr);
+
+    CommandList->ClearRenderTargetView(RtvHandle, ClearColor, 0, nullptr);
+    CommandList->IASetPrimitiveTopology(D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
+    CommandList->DrawInstanced(3, 1, 0, 0);
+  }
+
   CommandList->Close();
   ExecuteCommandList(CommandQueue, CommandList);
   WaitForSignal(CommandQueue, FO);

From 07a32cbbd6091e36df2b472b4f585711c4e48f35 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 15:59:05 -0400
Subject: [PATCH 09/26] logging fix

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 191899457f..123cf2a8ef 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12350,10 +12350,11 @@ void ExecutionTest::runCoopVecMulSubtest(
 
   LogCommentFmt(
       L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: "
-      L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s",
+      L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s, Stage: %s",
       Config.InputPerThread, Config.OutputPerThread, Config.NumThreads,
       Config.NumLayers, Config.Bias ? L"true" : L"false",
-      CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str());
+      CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(),
+      RunCompute ? L"Compute" : L"Pixel");
 
   const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4);
 
@@ -12828,8 +12829,7 @@ float4 ps_main() : SV_Target {
             fabs(Result - Expected) > 0.00001) {
           LogErrorFmt(L"Result mismatch at index %d",
                       i * Config.OutputPerThread + j);
-          LogErrorFmt(L"Result: %f, Expected: %f  (stage: %s)", Result,
-                      Expected, RunCompute ? L"compute" : L"pixel");
+          LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected);
           Equal = false;
         }
       }

From a2731a3ab04229baca908a52e374f67b60c1b5d1 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 16:01:45 -0400
Subject: [PATCH 10/26] s/ComputePipelineState/PipelineState/

---
 .../unittests/HLSLExec/ExecutionTest.cpp      | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 123cf2a8ef..903365914f 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12423,7 +12423,7 @@ void ExecutionTest::runCoopVecMulSubtest(
   }
 
   // Create the compute pipeline state for the CoopVec shader
-  CComPtr<ID3D12PipelineState> ComputePipelineState;
+  CComPtr<ID3D12PipelineState> PipelineState;
 
   std::string ShaderSource = R"(
 #include "dx/linalg.h"
@@ -12612,7 +12612,7 @@ float4 ps_main() : SV_Target {
 
   if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
-                     &ComputePipelineState, Options, _countof(Options),
+                     &PipelineState, Options, _countof(Options),
                      IncludeHandler);
   } else {
     CComPtr<ID3DBlob> VertexShader;
@@ -12638,7 +12638,7 @@ float4 ps_main() : SV_Target {
     PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
     PsoDesc.SampleDesc.Count = 1;
     VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState(
-        &PsoDesc, IID_PPV_ARGS(&ComputePipelineState)));
+        &PsoDesc, IID_PPV_ARGS(&PipelineState)));
   }
 
   // Create a command list for the compute shader.
@@ -12652,7 +12652,7 @@ float4 ps_main() : SV_Target {
   VERIFY_SUCCEEDED(D3DDevice->CreateCommandAllocator(
       D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&CommandAllocator)));
   VERIFY_SUCCEEDED(D3DDevice->CreateCommandList(
-      0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
+      0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, PipelineState,
       IID_PPV_ARGS(&CommandList)));
 
   std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVResources(
@@ -12752,7 +12752,7 @@ float4 ps_main() : SV_Target {
   ExecuteCommandList(CommandQueue, CommandList);
   WaitForSignal(CommandQueue, FO);
   VERIFY_SUCCEEDED(CommandAllocator->Reset());
-  VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, ComputePipelineState));
+  VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, PipelineState));
 
   SetDescriptorHeap(CommandList, DescriptorHeap);
 
@@ -12766,7 +12766,7 @@ float4 ps_main() : SV_Target {
   if (RunCompute) {
     CommandList->SetComputeRootSignature(RootSignature);
     CommandList->SetComputeRootDescriptorTable(0, ResHandle);
-    CommandList->SetPipelineState(ComputePipelineState);
+    CommandList->SetPipelineState(PipelineState);
     CommandList->Dispatch(1, 1, 1);
   } else {
     UINT FrameCount = 1;
@@ -13038,7 +13038,7 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
   }
 
   // Create a compute pipeline state object.
-  CComPtr<ID3D12PipelineState> ComputePipelineState;
+  CComPtr<ID3D12PipelineState> PipelineState;
 
   std::string ShaderSource = R"(
 #include "dx/linalg.h"
@@ -13173,7 +13173,7 @@ float4 ps_main() : SV_Target {
 
   if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
-                     &ComputePipelineState, Options, _countof(Options),
+                     &PipelineState, Options, _countof(Options),
                      IncludeHandler);
   } else {
     CComPtr<ID3DBlob> VertexShader;
@@ -13199,7 +13199,7 @@ float4 ps_main() : SV_Target {
     PsoDesc.RTVFormats[0] = DXGI_FORMAT_R8G8B8A8_UNORM;
     PsoDesc.SampleDesc.Count = 1;
     VERIFY_SUCCEEDED(D3DDevice->CreateGraphicsPipelineState(
-        &PsoDesc, IID_PPV_ARGS(&ComputePipelineState)));
+        &PsoDesc, IID_PPV_ARGS(&PipelineState)));
   }
 
   // Create a command list for the compute shader.
@@ -13213,7 +13213,7 @@ float4 ps_main() : SV_Target {
   VERIFY_SUCCEEDED(D3DDevice->CreateCommandAllocator(
       D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&CommandAllocator)));
   VERIFY_SUCCEEDED(D3DDevice->CreateCommandList(
-      0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
+      0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, PipelineState,
       IID_PPV_ARGS(&CommandList)));
 
   CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
@@ -13354,7 +13354,7 @@ float4 ps_main() : SV_Target {
   ExecuteCommandList(CommandQueue, CommandList);
   WaitForSignal(CommandQueue, FO);
   VERIFY_SUCCEEDED(CommandAllocator->Reset());
-  VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, ComputePipelineState));
+  VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, PipelineState));
 
   SetDescriptorHeap(CommandList, DescriptorHeap);
 
@@ -13368,7 +13368,7 @@ float4 ps_main() : SV_Target {
   if (RunCompute) {
     CommandList->SetComputeRootSignature(RootSignature);
     CommandList->SetComputeRootDescriptorTable(0, ResHandle);
-    CommandList->SetPipelineState(ComputePipelineState);
+    CommandList->SetPipelineState(PipelineState);
     CommandList->Dispatch(1, 1, 1);
   } else {
     UINT FrameCount = 1;
@@ -13414,7 +13414,7 @@ float4 ps_main() : SV_Target {
   WaitForSignal(CommandQueue, FO);
 
   VERIFY_SUCCEEDED(CommandAllocator->Reset());
-  VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, ComputePipelineState));
+  VERIFY_SUCCEEDED(CommandList->Reset(CommandAllocator, PipelineState));
 
   // Convert matrix to sint8/fp32 row-major format before reading back to the
   // CPU. A new resource is created, along with a readback resource, for the

From f5bfc88d5414db30084a22dcc3f42f1009a9e6fc Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 16:05:35 -0400
Subject: [PATCH 11/26] Add some more sizes to OuterProduct test

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 903365914f..19f7f660a5 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12899,7 +12899,14 @@ void ExecutionTest::runCoopVecOuterProductTestConfig(
           .c_str());
 
   constexpr CoopVecOuterProductSubtestConfig TestConfigs[] = {
-      {4, 4, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {4, 4, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {4, 4, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {16, 16, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {16, 16, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {32, 32, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {32, 32, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {64, 64, 16, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
+      {64, 64, 32, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL},
   };
 
   for (auto Config : TestConfigs) {

From ad20ee604943bc45bd8d0e024559864264107916 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 16:16:27 -0400
Subject: [PATCH 12/26] pixel shader bounds checks

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 19f7f660a5..bab6ab917b 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12509,6 +12509,13 @@ float4 vs_main(uint vid : SV_VertexID) : SV_Position {
 float4 ps_main() : SV_Target {
   uint threadIdx;
   InterlockedAdd(AtomicCounter[0], 1, threadIdx);
+  // threadIdx may exceed NUM_THREADS, but bounds checking on the vector
+  // loads/stores will prevent any faults from occurring. This lets us
+  // exercise the CoopVec implementation on more threads, giving us
+  // further confidence that there are no bad interactions between "good"
+  // threads and threads that fail bounds checking and operate on all-zero
+  // input data. This also gives us some additional testing of long vector
+  // bounds-checking.
   RunCoopVecTest(threadIdx);
   return float4(1, 1, 1, 1);
 }
@@ -13094,7 +13101,8 @@ float4 vs_main(uint vid : SV_VertexID) : SV_Position {
 float4 ps_main() : SV_Target {
   uint threadIdx;
   InterlockedAdd(AtomicCounter[0], 1, threadIdx);
-  RunCoopVecTest(threadIdx);
+  if (threadIdx < NUM_THREADS)
+    RunCoopVecTest(threadIdx);
   return float4(1, 1, 1, 1);
 }
 )";

From e70a45fca04bfc06d05506315633601ffaea635c Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Thu, 8 May 2025 19:38:44 -0400
Subject: [PATCH 13/26] Implement loading/storing input/output vectors through
 groupshared memory and improved input vector/matrix test patterns

---
 tools/clang/unittests/HLSLExec/CoopVec.h      |  90 ++++++++----
 .../unittests/HLSLExec/ExecutionTest.cpp      | 132 +++++++++++-------
 2 files changed, 146 insertions(+), 76 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index 18b8669197..bbcc5b8b96 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -6,6 +6,7 @@
 #include <DirectXPackedVector.h>
 
 #include <cstdlib>
+#include <random>
 #include <vector>
 
 #include "dxc/Support/microcom.h"
@@ -358,6 +359,15 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) {
   }
 }
 
+static bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
+  return DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+         DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 ||
+         DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16 ||
+         DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16 ||
+         DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 ||
+         DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32;
+}
+
 struct TestVector {
 private:
   size_t NumVectors = 0;
@@ -534,31 +544,51 @@ struct TestVector {
     }
   }
 
-  template <typename T> void fillSimpleTestData() {
-    // Create a vector of (1, 1, 0, ...)
+  template <typename T>
+  void fillSimpleTestData(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
+                          std::mt19937 &Rnd) {
     for (size_t I = 0; I < NumVectors; ++I) {
       T *Vec = getVector<T>(I);
       for (size_t J = 0; J < VectorSize; ++J)
-        if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
-          // Special case for HALF, which requires conversion from float
-          Vec[J] = static_cast<T>(
-              ConvertFloat32ToFloat16((J == 0 || J == 1) ? 1.0f : 0.0f));
+        if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF> ||
+                      std::is_same_v<T, float>) {
+          float Elt = 0.0f;
+          if (IsIntegralDataType(MatrixInterpretation)) {
+            Elt = (float)(Rnd() & 0x7) - 3.0f;
+          } else {
+            Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
+          }
+          if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
+            Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(Elt));
+          } else {
+            Vec[J] = static_cast<T>(Elt);
+          }
         } else {
-          Vec[J] = static_cast<T>((J == 0 || J == 1) ? 1 : 0);
+          if constexpr (std::is_signed_v<T>) {
+            Vec[J] = static_cast<T>((int32_t)(Rnd() & 0xf) - 8);
+          } else {
+            Vec[J] = static_cast<T>((uint32_t)(Rnd() & 0xf));
+          }
         }
     }
   }
 
-  template <typename T> void fillAllOnesTestData() {
-    // Create a vector of (1, 1, 1, ...)
+  template <typename T> void FillSimpleMatrixTestData(std::mt19937 &Rnd) {
     for (size_t I = 0; I < NumVectors; ++I) {
       T *Vec = getVector<T>(I);
       for (size_t J = 0; J < VectorSize; ++J)
         if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
-          // Special case for HALF, which requires conversion from float
-          Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(1.0f));
+          float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
+          Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(Elt));
+        } else if constexpr (std::is_same_v<T, float>) {
+          float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
+          Vec[J] = static_cast<T>(Elt);
         } else {
-          Vec[J] = static_cast<T>(1);
+          if constexpr (std::is_signed_v<T>) {
+            Vec[J] = static_cast<T>((int32_t)(Rnd() & 0xf) - 8);
+          } else {
+            Vec[J] = static_cast<T>((uint32_t)(Rnd() & 0xf));
+          }
         }
     }
   }
@@ -566,7 +596,9 @@ struct TestVector {
   static TestVector
   createSimpleTestVector(size_t NumVectors, size_t VectorSize,
                          D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
-                         D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+                         D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation,
+                         D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
+                         std::mt19937 &Rnd) {
     size_t ElementSize;
     switch (DataType) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
@@ -600,35 +632,36 @@ struct TestVector {
     TestVector Vec(NumVectors, VectorSize, ElementSize);
     switch (DataType) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
-      Vec.fillSimpleTestData<int8_t>();
+      Vec.fillSimpleTestData<int8_t>(MatrixInterpretation, Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
-      Vec.fillSimpleTestData<uint8_t>();
+      Vec.fillSimpleTestData<uint8_t>(MatrixInterpretation, Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
-      Vec.fillSimpleTestData<int16_t>();
+      Vec.fillSimpleTestData<int16_t>(MatrixInterpretation, Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
-      Vec.fillSimpleTestData<uint16_t>();
+      Vec.fillSimpleTestData<uint16_t>(MatrixInterpretation, Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
-      Vec.fillSimpleTestData<int32_t>();
+      Vec.fillSimpleTestData<int32_t>(MatrixInterpretation, Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
       if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
           DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) {
-        Vec.fillSimpleTestData<uint8_t>();
+        Vec.fillSimpleTestData<uint8_t>(MatrixInterpretation, Rnd);
       } else {
-        Vec.fillSimpleTestData<uint32_t>();
+        Vec.fillSimpleTestData<uint32_t>(MatrixInterpretation, Rnd);
       }
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
-      Vec.fillSimpleTestData<DirectX::PackedVector::HALF>();
+      Vec.fillSimpleTestData<DirectX::PackedVector::HALF>(MatrixInterpretation,
+                                                          Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
-      Vec.fillSimpleTestData<float>();
+      Vec.fillSimpleTestData<float>(MatrixInterpretation, Rnd);
       break;
     default:
       throw std::invalid_argument("Unsupported data type");
@@ -638,7 +671,8 @@ struct TestVector {
 
   static TestVector
   createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize,
-                          D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+                          D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation,
+                          std::mt19937 &Rnd) {
     size_t ElementSize;
     switch (DataInterpretation) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
@@ -666,13 +700,13 @@ struct TestVector {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
-      Vec.fillAllOnesTestData<int8_t>();
+      Vec.FillSimpleMatrixTestData<int8_t>(Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
-      Vec.fillAllOnesTestData<float>();
+      Vec.FillSimpleMatrixTestData<float>(Rnd);
       break;
     default:
       throw std::invalid_argument("Unsupported data type");
@@ -724,10 +758,12 @@ struct TestVector {
     ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize();
 
     if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
-      ConvertInfo.DestInfo.DestStride = (UINT)getVectorSize() * DestEltSize;
+      ConvertInfo.DestInfo.DestStride =
+          ((UINT)getVectorSize() * DestEltSize + 15) & ~15;
     } else if (MatrixLayout ==
                D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
-      ConvertInfo.DestInfo.DestStride = (UINT)getNumVectors() * DestEltSize;
+      ConvertInfo.DestInfo.DestStride =
+          ((UINT)getNumVectors() * DestEltSize + 15) & ~15;
     }
 
     // Get destination size using preview interface
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index bab6ab917b..a58137a63f 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12348,6 +12348,8 @@ void ExecutionTest::runCoopVecMulSubtest(
     ID3D12Device *D3DDevice, D3D12_COOPERATIVE_VECTOR_PROPERTIES_MUL &MulProps,
     CoopVecMulSubtestConfig &Config, bool RunCompute) {
 
+  std::mt19937 Rnd(0x42);
+
   LogCommentFmt(
       L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: "
       L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s, Stage: %s",
@@ -12399,19 +12401,20 @@ void ExecutionTest::runCoopVecMulSubtest(
     InputMatrices.push_back(
         ::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
             Config.InputPerThread, Config.InputPerThread,
-            MulProps.MatrixInterpretation));
+            MulProps.MatrixInterpretation, Rnd));
   }
   // Last layer, matrix size is OutputPerThread x InputPerThread
   InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
       Config.OutputPerThread, Config.InputPerThread,
-      MulProps.MatrixInterpretation));
+      MulProps.MatrixInterpretation, Rnd));
 
   auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector(
       Config.NumThreads, Config.InputPerThread, MulProps.InputType,
-      MulProps.InputInterpretation);
+      MulProps.InputInterpretation, MulProps.MatrixInterpretation, Rnd);
   auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector(
       1, std::max(Config.OutputPerThread, Config.InputPerThread),
-      MulProps.BiasInterpretation, MulProps.BiasInterpretation);
+      MulProps.BiasInterpretation, MulProps.BiasInterpretation,
+      MulProps.MatrixInterpretation, Rnd);
 
   // Calculate reference output
   auto ExpectedOutput = InputVector;
@@ -12435,20 +12438,32 @@ RWByteAddressBuffer OutputBuffer: register(u0);
 
 RWStructuredBuffer<uint> AtomicCounter : register(u10);
 
+#if USE_GROUPSHARED
+groupshared vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> inputGS[NUM_THREADS];
+groupshared vector<float, OUTPUT_PER_THREAD> outputGS[NUM_THREADS];
+#endif
+
 void RunCoopVecTest(uint threadIdx)
 {
   using namespace dx::linalg;
 
   uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
   vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> >(inputOffset);
-  VectorRef<BIAS_INTERPRETATION_ENUM> biasVec = { InputBias, 0 };
 
+#if USE_GROUPSHARED
+  // Use groupshared memory to grab the "next" thread's input vector.
+  inputGS[threadIdx] = input;
+  GroupMemoryBarrierWithGroupSync();
+  input = inputGS[(threadIdx + 1) % NUM_THREADS];
+#endif
+
+  VectorRef<BIAS_INTERPRETATION_ENUM> biasVec = { InputBias, 0 };
   vector<ACCUM_DATA_TYPE, OUTPUT_PER_THREAD> output;
 )";
 
   if (Config.NumLayers == 1) {
     ShaderSource += R"(
-  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix[0], 0, STRIDE };
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix[0], 0, STRIDE0 };
 
   if (USE_BIAS) {
     output = MulAdd<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
@@ -12460,17 +12475,17 @@ void RunCoopVecTest(uint threadIdx)
     ShaderSource += R"(
   vector<ACCUM_DATA_TYPE, INPUT_PER_THREAD> accum;
 
-  MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE };
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE0 };
   if (USE_BIAS) {
     accum = MulAdd<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
   } else {
     accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
   }
 
-  // Dummy activation function; all of our intermediates are positive (currently).
-  accum = max(accum, 0);
+  // Dummy activation function; all of our intermediates above -10000
+  accum = max(accum, -10000);
 
-  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat1 = { InputMatrix[1], 0, STRIDE };
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat1 = { InputMatrix[1], 0, STRIDE1 };
   if (USE_BIAS) {
     output = MulAdd<ACCUM_DATA_TYPE>(mat1, MakeInterpretedVector<ACCUM_INTERPRETATION_ENUM>(accum), biasVec);
   } else {
@@ -12482,6 +12497,13 @@ void RunCoopVecTest(uint threadIdx)
   ShaderSource += R"(
   vector<float, OUTPUT_PER_THREAD> result = (vector<float, OUTPUT_PER_THREAD>)output;
 
+#if USE_GROUPSHARED
+  // Use groupshared memory to grab the "previous" thread's output vector.
+  outputGS[threadIdx] = result;
+  GroupMemoryBarrierWithGroupSync();
+  result = outputGS[(threadIdx + NUM_THREADS - 1) % NUM_THREADS];
+#endif
+
   // Ensure 4-byte alignment for vector store
   uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float);
   OutputBuffer.Store<vector<float, OUTPUT_PER_THREAD> >(outputOffset, result);
@@ -12534,20 +12556,8 @@ float4 ps_main() : SV_Target {
     return Stream.str();
   };
 
-  int Stride = 0;
   const std::wstring HlslMatrixLayout =
       CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
-  int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
-      MulProps.MatrixInterpretation);
-  switch (Config.MatrixLayout) {
-  case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR:
-    Stride = Config.InputPerThread * StrideMultiplier;
-    break;
-  case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR:
-    Stride = Config.OutputPerThread * StrideMultiplier;
-    break;
-  }
-
   const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType(
       MulProps.InputInterpretation);
   const std::wstring InputDataType =
@@ -12570,7 +12580,6 @@ float4 ps_main() : SV_Target {
       CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread);
   auto NumThreadsDefine =
       CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
-  auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
   auto InputDataTypeDefine =
       CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType);
   auto InputDivisorDefine = CreateDefineFromInt(
@@ -12594,13 +12603,14 @@ float4 ps_main() : SV_Target {
   auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers);
   auto BiasInterpretationEnumDefine = CreateDefineFromString(
       L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum);
+  auto UseGroupsharedDefine =
+      CreateDefineFromInt(L"USE_GROUPSHARED", RunCompute ? 1 : 0);
 
-  LPCWSTR Options[] = {
+  std::vector<LPCWSTR> Options = {
       L"-enable-16bit-types",
       InputPerThreadDefine.c_str(),
       OutputPerThreadDefine.c_str(),
       NumThreadsDefine.c_str(),
-      StrideDefine.c_str(),
       InputDataTypeDefine.c_str(),
       InputDivisorDefine.c_str(),
       AccumDataTypeDefine.c_str(),
@@ -12612,23 +12622,35 @@ float4 ps_main() : SV_Target {
       InputVectorStrideDefine.c_str(),
       NumLayersDefine.c_str(),
       BiasInterpretationEnumDefine.c_str(),
+      UseGroupsharedDefine.c_str(),
   };
 
+  std::vector<std::wstring> StrideDefines(Config.NumLayers); // stable c_str()
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    auto ConvertInfo = InputMatrices[I].getConversionInfo(
+        D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout);
+    wchar_t StrideName[16];
+    swprintf(StrideName, _countof(StrideName), L"STRIDE%d", I);
+    StrideDefines[I] =
+        CreateDefineFromInt(StrideName, ConvertInfo.DestInfo.DestStride);
+    Options.push_back(StrideDefines[I].c_str());
+  }
+
   CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
       new LinAlgHeaderIncludeHandler(m_support);
 
   if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
-                     &PipelineState, Options, _countof(Options),
+                     &PipelineState, Options.data(), (int)Options.size(),
                      IncludeHandler);
   } else {
     CComPtr<ID3DBlob> VertexShader;
     CComPtr<ID3DBlob> PixelShader;
 
     CompileFromText(ShaderSource.c_str(), L"vs_main", L"vs_6_9", &VertexShader,
-                    Options, _countof(Options), IncludeHandler);
+                    Options.data(), (int)Options.size(), IncludeHandler);
     CompileFromText(ShaderSource.c_str(), L"ps_main", L"ps_6_9", &PixelShader,
-                    Options, _countof(Options), IncludeHandler);
+                    Options.data(), (int)Options.size(), IncludeHandler);
 
     D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {};
     // psoDesc.InputLayout;
@@ -12828,16 +12850,30 @@ float4 ps_main() : SV_Target {
     float *ResultBuffer = (float *)MappedData.data();
     bool Equal = true;
 
-    for (int i = 0; i < Config.NumThreads; ++i) {
+    float MaxError = 0.00001f;
+    if (MulProps.MatrixInterpretation ==
+        D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
+      // Allow for more error in fp16 relative to the fp32 reference
+      MaxError = 0.1f;
+    } else if (MulProps.MatrixInterpretation ==
+               D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3) {
+      // And even more error for the fp8 formats
+      MaxError = 1.0f;
+    } else if (MulProps.MatrixInterpretation ==
+               D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+      MaxError = 3.0f;
+    }
+
+    for (int i = 0; i < Config.NumThreads && Equal; ++i) {
       for (int j = 0; j < Config.OutputPerThread; ++j) {
         float Result = ResultBuffer[i * Config.OutputPerThread + j];
         float Expected = ExpectedOutput.getVector<float>(i)[j];
         if (isnan(Result) || isnan(Expected) ||
-            fabs(Result - Expected) > 0.00001) {
-          LogErrorFmt(L"Result mismatch at index %d",
-                      i * Config.OutputPerThread + j);
+            fabs(Result - Expected) > MaxError) {
+          LogErrorFmt(L"Result mismatch at vector %d, element %d", i, j);
           LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected);
           Equal = false;
+          break;
         }
       }
     }
@@ -12938,6 +12974,8 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
     D3D12_COOPERATIVE_VECTOR_PROPERTIES_ACCUMULATE &AccumulateProps,
     CoopVecOuterProductSubtestConfig &Config, bool RunCompute) {
 
+  std::mt19937 Rnd(0x42);
+
   LogCommentFmt(
       L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s, "
       L"Stage: %s",
@@ -12996,17 +13034,17 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
     InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN,
                                                                   Config.DimM);
   } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
+    WEX::Logging::Log::Comment(L"Unsupported matrix data type");
     return;
   }
 
   // Create input vectors
   auto InputVector1 = CoopVecHelpers::TestVector::createSimpleTestVector(
       Config.NumThreads, Config.DimM, AccumulateProps.InputType,
-      AccumulateProps.InputType);
+      AccumulateProps.InputType, AccumulateProps.AccumulationType, Rnd);
   auto InputVector2 = CoopVecHelpers::TestVector::createSimpleTestVector(
       Config.NumThreads, Config.DimN, AccumulateProps.InputType,
-      AccumulateProps.InputType);
+      AccumulateProps.InputType, AccumulateProps.AccumulationType, Rnd);
 
   // Calculate reference output
   auto ExpectedOutputBufferI8 =
@@ -13017,14 +13055,11 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
               ExpectedOutputBufferI8.size());
 
   if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
-    DirectX::PackedVector::HALF *InputVector1FP16 =
-        reinterpret_cast<DirectX::PackedVector::HALF *>(
-            InputVector1.getBuffer());
-    DirectX::PackedVector::HALF *InputVector2FP16 =
-        reinterpret_cast<DirectX::PackedVector::HALF *>(
-            InputVector2.getBuffer());
-
     for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      auto *InputVector1FP16 =
+          InputVector1.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
+      auto *InputVector2FP16 =
+          InputVector2.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
       for (int M = 0; M < Config.DimM; ++M) {
         for (int N = 0; N < Config.DimN; ++N) {
           float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) *
@@ -13035,20 +13070,19 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
     }
   } else if (AccumulateProps.InputType ==
              D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    float *InputVector1FP32 =
-        reinterpret_cast<float *>(InputVector1.getBuffer());
-    float *InputVector2FP32 =
-        reinterpret_cast<float *>(InputVector2.getBuffer());
-
     for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      auto *InputVector1FP32 = InputVector1.getVector<float>(ThreadIdx);
+      auto *InputVector2FP32 = InputVector2.getVector<float>(ThreadIdx);
       for (int M = 0; M < Config.DimM; ++M) {
         for (int N = 0; N < Config.DimN; ++N) {
-          float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] *
-                      InputVector2FP32[ThreadIdx * Config.DimN + N];
+          float Acc = InputVector1FP32[M] * InputVector2FP32[N];
           ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
         }
       }
     }
+  } else {
+    WEX::Logging::Log::Comment(L"Unsupported input data type");
+    return;
   }
 
   // Create a compute pipeline state object.

From 00b0385458ad86a5d2cea16f601cf62074df83a1 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 10:23:17 -0400
Subject: [PATCH 14/26] Update CoopVecAPI.h with latest Agility SDK preview
 d3d12.h

---
 tools/clang/unittests/HLSLExec/CoopVecAPI.h    | 14 ++++++--------
 .../clang/unittests/HLSLExec/ExecutionTest.cpp | 18 +++++++++---------
 2 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVecAPI.h b/tools/clang/unittests/HLSLExec/CoopVecAPI.h
index 16c1105edc..563366e0bc 100644
--- a/tools/clang/unittests/HLSLExec/CoopVecAPI.h
+++ b/tools/clang/unittests/HLSLExec/CoopVecAPI.h
@@ -145,18 +145,16 @@ ID3D12DevicePreview : public IUnknown
 #endif 	/* __ID3D12DevicePreview_INTERFACE_DEFINED__ */
 
 
-#ifndef __ID3D12GraphicsCommandList11_INTERFACE_DEFINED__
-#define __ID3D12GraphicsCommandList11_INTERFACE_DEFINED__
+#ifndef __ID3D12GraphicsCommandListPreview_INTERFACE_DEFINED__
+#define __ID3D12GraphicsCommandListPreview_INTERFACE_DEFINED__
 
-EXTERN_C const IID IID_ID3D12GraphicsCommandList11;
+EXTERN_C const IID IID_ID3D12GraphicsCommandListPreview;
 
-MIDL_INTERFACE("f0dcfabc-a84a-4fe3-b3b9-eab26b306c38")
-ID3D12GraphicsCommandList11 : public ID3D12GraphicsCommandList10
+MIDL_INTERFACE("536d9bb6-9eee-4c75-86e8-e29e29e08ed3")
+ID3D12GraphicsCommandListPreview : public ID3D12GraphicsCommandList10
 {
 public:
     virtual void STDMETHODCALLTYPE Reserved0() = 0;
-    virtual void STDMETHODCALLTYPE Reserved1() = 0;
-    virtual void STDMETHODCALLTYPE Reserved2() = 0;
 
     virtual void STDMETHODCALLTYPE ConvertLinearAlgebraMatrix( 
         _In_  const D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO *pDesc,
@@ -164,7 +162,7 @@ ID3D12GraphicsCommandList11 : public ID3D12GraphicsCommandList10
     
 };
 
-#endif 	/* __ID3D12GraphicsCommandList11_INTERFACE_DEFINED__ */
+#endif 	/* __ID3D12GraphicsCommandListPreview_INTERFACE_DEFINED__ */
 
 #else // __ID3D12GraphicsCommandList10_INTERFACE_DEFINED__
 // The used d3d12.h header does not support ID3D12GraphicsCommandList10,
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index a58137a63f..f369d38ae8 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12745,10 +12745,10 @@ float4 ps_main() : SV_Target {
         InputMatrixSRVResources[I]->GetGPUVirtualAddress();
 
     // Get command list interface and perform conversion
-    CComPtr<ID3D12GraphicsCommandList11> CommandList11;
+    CComPtr<ID3D12GraphicsCommandListPreview> CommandListPreview;
     VERIFY_SUCCEEDED(CommandList->QueryInterface(
-        __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
-    CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
+        __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview));
+    CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
     // This increments BaseHandle
     CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t),
@@ -13376,10 +13376,10 @@ float4 ps_main() : SV_Target {
     ConvertInfo.DataDesc = DataDesc;
 
     // Get command list interface and perform conversion
-    CComPtr<ID3D12GraphicsCommandList11> CommandList11;
+    CComPtr<ID3D12GraphicsCommandListPreview> CommandListPreview;
     VERIFY_SUCCEEDED(CommandList->QueryInterface(
-        __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
-    CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
+        __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview));
+    CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
     // This increments baseHandle
     if ((ConvertInfo.DestInfo.DestSize % 4) != 0) {
@@ -13527,10 +13527,10 @@ float4 ps_main() : SV_Target {
         ConvertedMatrixResource->GetGPUVirtualAddress();
 
     // Get command list interface and perform conversion
-    CComPtr<ID3D12GraphicsCommandList11> CommandList11;
+    CComPtr<ID3D12GraphicsCommandListPreview> CommandListPreview;
     VERIFY_SUCCEEDED(CommandList->QueryInterface(
-        __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
-    CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
+        __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview));
+    CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
   }
 
   RecordTransitionBarrier(CommandList, MatrixRowMajorResource,

From e21d92d82569697b7afd1d38560c20a56a7f236d Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 12:04:50 -0400
Subject: [PATCH 15/26] clang-format

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index f369d38ae8..a06de31508 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12746,8 +12746,9 @@ float4 ps_main() : SV_Target {
 
     // Get command list interface and perform conversion
     CComPtr<ID3D12GraphicsCommandListPreview> CommandListPreview;
-    VERIFY_SUCCEEDED(CommandList->QueryInterface(
-        __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview));
+    VERIFY_SUCCEEDED(
+        CommandList->QueryInterface(__uuidof(ID3D12GraphicsCommandListPreview),
+                                    (void **)&CommandListPreview));
     CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
     // This increments BaseHandle
@@ -13377,8 +13378,9 @@ float4 ps_main() : SV_Target {
 
     // Get command list interface and perform conversion
     CComPtr<ID3D12GraphicsCommandListPreview> CommandListPreview;
-    VERIFY_SUCCEEDED(CommandList->QueryInterface(
-        __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview));
+    VERIFY_SUCCEEDED(
+        CommandList->QueryInterface(__uuidof(ID3D12GraphicsCommandListPreview),
+                                    (void **)&CommandListPreview));
     CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
     // This increments baseHandle
@@ -13528,8 +13530,9 @@ float4 ps_main() : SV_Target {
 
     // Get command list interface and perform conversion
     CComPtr<ID3D12GraphicsCommandListPreview> CommandListPreview;
-    VERIFY_SUCCEEDED(CommandList->QueryInterface(
-        __uuidof(ID3D12GraphicsCommandListPreview), (void **)&CommandListPreview));
+    VERIFY_SUCCEEDED(
+        CommandList->QueryInterface(__uuidof(ID3D12GraphicsCommandListPreview),
+                                    (void **)&CommandListPreview));
     CommandListPreview->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
   }
 

From 20b6ad76e6ced4ed7d7808b870022962fccc14f0 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 14:53:38 -0400
Subject: [PATCH 16/26] Fix w-pos in vertex shaders

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index a06de31508..18e27f849d 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12519,11 +12519,11 @@ void main(uint threadIdx : SV_GroupThreadID)
 float4 vs_main(uint vid : SV_VertexID) : SV_Position {
   switch (vid) {
   case 0:
-    return float4(-1, 1, 0, 0);
+    return float4(-1, 1, 0, 1);
   case 1:
-    return float4(3, 1, 0, 0);
+    return float4(3, 1, 0, 1);
   case 2:
-    return float4(-1, -3, 0, 0);
+    return float4(-1, -3, 0, 1);
   }
   return float4(0, 0, 0, 0);
 }
@@ -13124,11 +13124,11 @@ void main(uint threadIdx : SV_GroupThreadID)
 float4 vs_main(uint vid : SV_VertexID) : SV_Position {
   switch (vid) {
   case 0:
-    return float4(-1, 1, 0, 0);
+    return float4(-1, 1, 0, 1);
   case 1:
-    return float4(3, 1, 0, 0);
+    return float4(3, 1, 0, 1);
   case 2:
-    return float4(-1, -3, 0, 0);
+    return float4(-1, -3, 0, 1);
   }
   return float4(0, 0, 0, 0);
 }

From 6f8a4e21fa195753f082ba3d62ffd29c0484b0bf Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 14:57:36 -0400
Subject: [PATCH 17/26] Re-enable debug layer

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 18e27f849d..27159375ff 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -758,7 +758,7 @@ class ExecutionTest {
 #endif
   }
 
-  bool UseDebugIfaces() { return false; }
+  bool UseDebugIfaces() { return true; }
 
   bool SaveImages() { return GetTestParamBool(L"SaveImages"); }
 

From 9d171ed20354ff261af69f7b33b95cd53a98e9d6 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 15:15:56 -0400
Subject: [PATCH 18/26] style nits

---
 tools/clang/unittests/HLSLExec/CoopVec.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index bbcc5b8b96..a354c0b4cb 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -34,9 +34,10 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
               L"LinAlgHeader", ParamValue))) {
         return E_FAIL;
       }
-      if (ParamValue.IsEmpty()) {
+
+      if (ParamValue.IsEmpty())
         return E_FAIL;
-      }
+
       LPCWSTR RealHeaderPath =
           reinterpret_cast<LPCWSTR>(ParamValue.GetBuffer());
 
@@ -382,15 +383,12 @@ struct TestVector {
              size_t Alignment = 16)
       : NumVectors(NumVectors), VectorSize(VectorSize),
         ElementSize(ElementSize) {
-    if (NumVectors == 0) {
+    if (NumVectors == 0)
       throw std::invalid_argument("NumVectors must be greater than 0");
-    }
-    if (VectorSize == 0) {
+    if (VectorSize == 0)
       throw std::invalid_argument("VectorSize must be greater than 0");
-    }
-    if (ElementSize == 0) {
+    if (ElementSize == 0)
       throw std::invalid_argument("ElementSize must be greater than 0");
-    }
 
     size_t VectorBytes = VectorSize * ElementSize;
     Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment;
@@ -488,9 +486,8 @@ struct TestVector {
       Buffer = reinterpret_cast<uint8_t *>(Ptr);
 
       // Copy data
-      if (other.Buffer) {
+      if (other.Buffer)
         std::memcpy(Buffer, other.Buffer, TotalBytes);
-      }
     }
     return *this;
   }

From b7bc46b5ea518974a8ea356853ca946575f8ae16 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 15:51:56 -0400
Subject: [PATCH 19/26] Fix up some uses of int and uint32_t that should be
 size_t

---
 tools/clang/unittests/HLSLExec/CoopVec.h      |  32 ++--
 .../unittests/HLSLExec/ExecutionTest.cpp      | 142 +++++++++---------
 2 files changed, 90 insertions(+), 84 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index a354c0b4cb..689a4f214f 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -67,10 +67,10 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
 namespace CoopVecHelpers {
 
 template <typename EltTy>
-static std::vector<uint8_t> CreateAllOnesInputMatrix(uint32_t Width,
-                                                     uint32_t Height) {
+static std::vector<uint8_t> CreateAllOnesInputMatrix(size_t Width,
+                                                     size_t Height) {
   std::vector<EltTy> InputMatrix(Width * Height);
-  for (uint32_t i = 0; i < Width * Height; i++) {
+  for (size_t i = 0; i < Width * Height; i++) {
     if constexpr (std::is_same_v<EltTy, uint8_t> ||
                   std::is_same_v<EltTy, int8_t>) {
       InputMatrix[i] = 1;
@@ -92,15 +92,15 @@ static std::vector<uint8_t> CreateAllOnesInputMatrix(uint32_t Width,
 }
 
 template <typename EltTy>
-static std::vector<uint8_t> CreateInputVector(uint32_t NumThreads,
-                                              uint32_t EltsPerThread) {
+static std::vector<uint8_t> CreateInputVector(size_t NumThreads,
+                                              size_t EltsPerThread) {
   std::vector<EltTy> InputVector(NumThreads * EltsPerThread);
   std::fill(InputVector.begin(), InputVector.end(), EltTy(0));
   if (EltsPerThread < 2) {
     WEX::Logging::Log::Error(L"EltsPerThread must be at least 2");
     return std::vector<uint8_t>();
   }
-  for (uint32_t TID = 0; TID < NumThreads; TID++) {
+  for (size_t TID = 0; TID < NumThreads; TID++) {
     if constexpr (std::is_same_v<EltTy, uint8_t> ||
                   std::is_same_v<EltTy, int8_t>) {
       InputVector[TID * EltsPerThread + 0] = 1;
@@ -125,7 +125,7 @@ static std::vector<uint8_t> CreateInputVector(uint32_t NumThreads,
 }
 
 template <typename EltTy>
-static std::vector<uint8_t> CreateInputBias(uint32_t NumElts) {
+static std::vector<uint8_t> CreateInputBias(size_t NumElts) {
   std::vector<EltTy> InputBias(NumElts);
   if constexpr (std::is_same_v<EltTy, uint8_t> ||
                 std::is_same_v<EltTy, int8_t>) {
@@ -248,7 +248,7 @@ static std::wstring MatrixLayoutToHlslLayoutString(
 
 // This multiplier is used to compute the row/column stride for a matrix
 // given it's element size.
-static int
+static size_t
 GetStrideMultiplierForMatrixDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   switch (DataType) {
   case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED:
@@ -271,7 +271,7 @@ GetStrideMultiplierForMatrixDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   }
 }
 
-static int GetNumPackedElementsForInputDataType(
+static size_t GetNumPackedElementsForInputDataType(
     D3D12_LINEAR_ALGEBRA_DATATYPE InputInterpretation) {
   // Int8 packed types are the only ones that have more than 1 element per
   // shader variable
@@ -724,7 +724,7 @@ struct TestVector {
 
     // Create destination matrix info
     ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver
-    int DestEltSize = 0;
+    UINT DestEltSize = 0;
     switch (DestDataType) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED:
@@ -798,14 +798,14 @@ struct TestVector {
                          sizeof(float));
 
     if (IsMatrixFP32) {
-      for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
+      for (size_t VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
         const DirectX::PackedVector::HALF *InputBiasFP16 =
             Bias.getVector<DirectX::PackedVector::HALF>(0);
-        for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors();
+        for (size_t OutputIdx = 0; OutputIdx < Matrix.getNumVectors();
              ++OutputIdx) {
           float Acc = 0;
 
-          for (int InputIdx = 0; InputIdx < Matrix.getVectorSize();
+          for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize();
                ++InputIdx) {
             float InputElem;
             if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
@@ -829,13 +829,13 @@ struct TestVector {
         }
       }
     } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-      for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
+      for (size_t VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
         const int32_t *InputBiasI32 = Bias.getVector<int32_t>(0);
-        for (int OutputIdx = 0; OutputIdx < Matrix.getNumVectors();
+        for (size_t OutputIdx = 0; OutputIdx < Matrix.getNumVectors();
              ++OutputIdx) {
           int Acc = 0;
 
-          for (int InputIdx = 0; InputIdx < Matrix.getVectorSize();
+          for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize();
                ++InputIdx) {
             int InputElem;
             if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 27159375ff..afdde90029 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -786,10 +786,10 @@ class ExecutionTest {
 
 #if HAVE_COOPVEC_API
   struct CoopVecMulSubtestConfig {
-    int InputPerThread;
-    int OutputPerThread;
-    int NumThreads;
-    int NumLayers;
+    size_t InputPerThread;
+    size_t OutputPerThread;
+    size_t NumThreads;
+    size_t NumLayers;
     D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout;
     bool Bias;
   };
@@ -802,9 +802,9 @@ class ExecutionTest {
                             CoopVecMulSubtestConfig &Config, bool RunCompute);
 
   struct CoopVecOuterProductSubtestConfig {
-    int DimM; // Row Count
-    int DimN; // Column Count
-    int NumThreads;
+    size_t DimM; // Row Count
+    size_t DimN; // Column Count
+    size_t NumThreads;
     D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout;
   };
 
@@ -12358,13 +12358,15 @@ void ExecutionTest::runCoopVecMulSubtest(
       CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(),
       RunCompute ? L"Compute" : L"Pixel");
 
-  const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4);
+  const size_t OutputBufferSize =
+      (Config.OutputPerThread * Config.NumThreads * 4);
 
   // Create root signature with a single root entry for all SRVs and UAVs
   CComPtr<ID3D12RootSignature> RootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE Ranges[2];
-    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0,
+    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + (UINT)Config.NumLayers,
+                   0,
                    0); // InputVector, InputBias, InputMatrices[]
     Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer
 
@@ -12385,7 +12387,7 @@ void ExecutionTest::runCoopVecMulSubtest(
   {
     D3D12_DESCRIPTOR_HEAP_DESC Desc = {};
     Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-    Desc.NumDescriptors = 3 + Config.NumLayers;
+    Desc.NumDescriptors = 3 + (UINT)Config.NumLayers;
     Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
     VERIFY_SUCCEEDED(
         D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap)));
@@ -12396,7 +12398,7 @@ void ExecutionTest::runCoopVecMulSubtest(
   // Our input matrix is really a set of row vectors, which we can represent
   // as a TestVector.
   std::vector<::CoopVecHelpers::TestVector> InputMatrices;
-  for (int I = 0; I < Config.NumLayers - 1; ++I) {
+  for (size_t I = 0; I < Config.NumLayers - 1; ++I) {
     // Each layer except the last is InputPerThread x InputPerThread
     InputMatrices.push_back(
         ::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
@@ -12418,7 +12420,7 @@ void ExecutionTest::runCoopVecMulSubtest(
 
   // Calculate reference output
   auto ExpectedOutput = InputVector;
-  for (int I = 0; I < Config.NumLayers; ++I) {
+  for (size_t I = 0; I < Config.NumLayers; ++I) {
     ExpectedOutput = ::CoopVecHelpers::TestVector::matrixVectorMultiply(
         InputMatrices[I], ExpectedOutput, InputBias, Config.Bias,
         MulProps.MatrixInterpretation,
@@ -12543,7 +12545,7 @@ float4 ps_main() : SV_Target {
 }
 )";
 
-  auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
+  auto CreateDefineFromSize = [](const wchar_t *Name, size_t Value) {
     std::wstringstream Stream;
     Stream << L"-D" << Name << L"=" << Value;
     return Stream.str();
@@ -12558,8 +12560,9 @@ float4 ps_main() : SV_Target {
 
   const std::wstring HlslMatrixLayout =
       CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
-  const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType(
-      MulProps.InputInterpretation);
+  const size_t InputDivisor =
+      CoopVecHelpers::GetNumPackedElementsForInputDataType(
+          MulProps.InputInterpretation);
   const std::wstring InputDataType =
       CoopVecHelpers::GetHlslDataTypeForDataType(MulProps.InputType);
   const std::wstring AccumDataType =
@@ -12575,14 +12578,14 @@ float4 ps_main() : SV_Target {
           MulProps.BiasInterpretation);
 
   auto InputPerThreadDefine =
-      CreateDefineFromInt(L"INPUT_PER_THREAD", Config.InputPerThread);
+      CreateDefineFromSize(L"INPUT_PER_THREAD", Config.InputPerThread);
   auto OutputPerThreadDefine =
-      CreateDefineFromInt(L"OUTPUT_PER_THREAD", Config.OutputPerThread);
+      CreateDefineFromSize(L"OUTPUT_PER_THREAD", Config.OutputPerThread);
   auto NumThreadsDefine =
-      CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
+      CreateDefineFromSize(L"NUM_THREADS", Config.NumThreads);
   auto InputDataTypeDefine =
       CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType);
-  auto InputDivisorDefine = CreateDefineFromInt(
+  auto InputDivisorDefine = CreateDefineFromSize(
       L"INPUT_VECTOR_NUM_ELEMENTS",
       (Config.InputPerThread + InputDivisor - 1) / InputDivisor);
   auto AccumDataTypeDefine =
@@ -12593,18 +12596,18 @@ float4 ps_main() : SV_Target {
       CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout);
   auto MatrixDataTypeEnumDefine =
       CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum);
-  auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
+  auto UseBiasDefine = CreateDefineFromSize(L"USE_BIAS", Config.Bias ? 1 : 0);
   // Treat the accumulator interpretation the same as the input interpretation
   // for the purposes of MakeInterpretedVector.
   auto AccumInterpretationEnumDefine = CreateDefineFromString(
       L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum);
   auto InputVectorStrideDefine =
-      CreateDefineFromInt(L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
-  auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers);
+      CreateDefineFromSize(L"INPUT_VECTOR_STRIDE", InputVector.getStride());
+  auto NumLayersDefine = CreateDefineFromSize(L"NUM_LAYERS", Config.NumLayers);
   auto BiasInterpretationEnumDefine = CreateDefineFromString(
       L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum);
   auto UseGroupsharedDefine =
-      CreateDefineFromInt(L"USE_GROUPSHARED", RunCompute ? 1 : 0);
+      CreateDefineFromSize(L"USE_GROUPSHARED", RunCompute ? 1 : 0);
 
   std::vector<LPCWSTR> Options = {
       L"-enable-16bit-types",
@@ -12626,13 +12629,13 @@ float4 ps_main() : SV_Target {
   };
 
   std::vector<std::wstring> StrideDefines;
-  for (int I = 0; I < Config.NumLayers; ++I) {
+  for (size_t I = 0; I < Config.NumLayers; ++I) {
     auto ConvertInfo = InputMatrices[I].getConversionInfo(
         D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout);
     wchar_t StrideName[16];
-    swprintf(StrideName, _countof(StrideName), L"STRIDE%d", I);
+    swprintf(StrideName, _countof(StrideName), L"STRIDE%zu", I);
     StrideDefines.push_back(
-        CreateDefineFromInt(StrideName, ConvertInfo.DestInfo.DestStride));
+        CreateDefineFromSize(StrideName, ConvertInfo.DestInfo.DestStride));
     Options.push_back(StrideDefines[I].c_str());
   }
 
@@ -12688,7 +12691,7 @@ float4 ps_main() : SV_Target {
       Config.NumLayers);
   std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVUploadResources(
       Config.NumLayers);
-  for (int I = 0; I < Config.NumLayers; ++I) {
+  for (size_t I = 0; I < Config.NumLayers; ++I) {
     CreateTestResources(
         D3DDevice, CommandList, InputMatrices[I].getBuffer(),
         InputMatrices[I].getTotalBytes(),
@@ -12727,11 +12730,11 @@ float4 ps_main() : SV_Target {
   // Create converted matrix resource and SRV for each input matrix
   std::vector<CComPtr<ID3D12Resource>> ConvertedMatrixResources(
       Config.NumLayers);
-  for (int I = 0; I < Config.NumLayers; ++I) {
+  for (size_t I = 0; I < Config.NumLayers; ++I) {
     auto ConvertInfo = InputMatrices[I].getConversionInfo(
         D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout);
 
-    int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16;
+    UINT SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16;
 
     // Create resource to hold matrix copy
     CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize,
@@ -12776,7 +12779,7 @@ float4 ps_main() : SV_Target {
   CreateTestUavs(D3DDevice, CommandList, OutputBufferInit.data(),
                  OutputBufferSize, &UavResource, &UavUploadResource,
                  &UavReadResource);
-  CreateRawUAV(D3DDevice, BaseHandle, OutputBufferSize / 4, UavResource);
+  CreateRawUAV(D3DDevice, BaseHandle, (UINT)OutputBufferSize / 4, UavResource);
 
   CommandList->Close();
   ExecuteCommandList(CommandQueue, CommandList);
@@ -12846,7 +12849,7 @@ float4 ps_main() : SV_Target {
   WaitForSignal(CommandQueue, FO);
 
   {
-    MappedData MappedData(UavReadResource, OutputBufferSize);
+    MappedData MappedData(UavReadResource, (UINT)OutputBufferSize);
 
     float *ResultBuffer = (float *)MappedData.data();
     bool Equal = true;
@@ -12865,8 +12868,8 @@ float4 ps_main() : SV_Target {
       MaxError = 3.0f;
     }
 
-    for (int i = 0; i < Config.NumThreads && Equal; ++i) {
-      for (int j = 0; j < Config.OutputPerThread; ++j) {
+    for (size_t i = 0; i < Config.NumThreads && Equal; ++i) {
+      for (size_t j = 0; j < Config.OutputPerThread; ++j) {
         float Result = ResultBuffer[i * Config.OutputPerThread + j];
         float Expected = ExpectedOutput.getVector<float>(i)[j];
         if (isnan(Result) || isnan(Expected) ||
@@ -13056,13 +13059,13 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
               ExpectedOutputBufferI8.size());
 
   if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+    for (size_t ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
       auto *InputVector1FP16 =
           InputVector1.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
       auto *InputVector2FP16 =
           InputVector2.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
-      for (int M = 0; M < Config.DimM; ++M) {
-        for (int N = 0; N < Config.DimN; ++N) {
+      for (size_t M = 0; M < Config.DimM; ++M) {
+        for (size_t N = 0; N < Config.DimN; ++N) {
           float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) *
                       ConvertFloat16ToFloat32(InputVector2FP16[N]);
           ExpectedOutputBuffer[M * Config.DimN + N] += acc;
@@ -13071,11 +13074,11 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
     }
   } else if (AccumulateProps.InputType ==
              D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+    for (size_t ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
       auto *InputVector1FP32 = InputVector1.getVector<float>(ThreadIdx);
       auto *InputVector2FP32 = InputVector2.getVector<float>(ThreadIdx);
-      for (int M = 0; M < Config.DimM; ++M) {
-        for (int N = 0; N < Config.DimN; ++N) {
+      for (size_t M = 0; M < Config.DimM; ++M) {
+        for (size_t N = 0; N < Config.DimN; ++N) {
           float Acc = InputVector1FP32[M] * InputVector2FP32[N];
           ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
         }
@@ -13142,7 +13145,7 @@ float4 ps_main() : SV_Target {
 }
 )";
 
-  auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
+  auto CreateDefineFromSize = [](const wchar_t *Name, size_t Value) {
     std::wstringstream Stream;
     Stream << L"-D" << Name << L"=" << Value;
     return Stream.str();
@@ -13154,11 +13157,12 @@ float4 ps_main() : SV_Target {
     return Stream.str();
   };
 
-  int Stride = 0;
+  size_t Stride = 0;
   const std::wstring HlslMatrixLayout =
       CoopVecHelpers::MatrixLayoutToHlslLayoutString(Config.MatrixLayout);
-  int StrideMultiplier = CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
-      AccumulateProps.AccumulationType);
+  size_t StrideMultiplier =
+      CoopVecHelpers::GetStrideMultiplierForMatrixDataType(
+          AccumulateProps.AccumulationType);
   switch (Config.MatrixLayout) {
   case D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR:
     Stride = Config.DimN * StrideMultiplier;
@@ -13168,8 +13172,9 @@ float4 ps_main() : SV_Target {
     break;
   }
 
-  const int InputDivisor = CoopVecHelpers::GetNumPackedElementsForInputDataType(
-      AccumulateProps.InputType);
+  const size_t InputDivisor =
+      CoopVecHelpers::GetNumPackedElementsForInputDataType(
+          AccumulateProps.InputType);
   const std::wstring InputDataType =
       CoopVecHelpers::GetHlslDataTypeForDataType(AccumulateProps.InputType);
   const std::wstring AccumDataType = CoopVecHelpers::GetHlslDataTypeForDataType(
@@ -13181,14 +13186,15 @@ float4 ps_main() : SV_Target {
       CoopVecHelpers::GetHlslInterpretationForDataType(
           AccumulateProps.InputType);
 
-  auto DimMDefine = CreateDefineFromInt(L"DIM_M", Config.DimM);
-  auto DimNDefine = CreateDefineFromInt(L"DIM_N", Config.DimN);
+  auto DimMDefine = CreateDefineFromSize(L"DIM_M", Config.DimM);
+  auto DimNDefine = CreateDefineFromSize(L"DIM_N", Config.DimN);
   auto NumThreadsDefine =
-      CreateDefineFromInt(L"NUM_THREADS", Config.NumThreads);
-  auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
+      CreateDefineFromSize(L"NUM_THREADS", Config.NumThreads);
+  auto StrideDefine = CreateDefineFromSize(L"STRIDE", Stride);
   auto InputDataTypeDefine =
       CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType.c_str());
-  auto InputDivisorDefine = CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor);
+  auto InputDivisorDefine =
+      CreateDefineFromSize(L"INPUT_DIVISOR", InputDivisor);
   auto AccumDataTypeDefine =
       CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType.c_str());
   auto InputInterpretationEnumDefine = CreateDefineFromString(
@@ -13197,10 +13203,10 @@ float4 ps_main() : SV_Target {
       CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str());
   auto MatrixDataTypeEnumDefine = CreateDefineFromString(
       L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str());
-  auto InputVector1StrideDefine = CreateDefineFromInt(
-      L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride());
-  auto InputVector2StrideDefine = CreateDefineFromInt(
-      L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride());
+  auto InputVector1StrideDefine =
+      CreateDefineFromSize(L"INPUT_VECTOR_1_STRIDE", InputVector1.getStride());
+  auto InputVector2StrideDefine =
+      CreateDefineFromSize(L"INPUT_VECTOR_2_STRIDE", InputVector2.getStride());
 
   LPCWSTR Options[] = {
       L"-enable-16bit-types",
@@ -13295,7 +13301,7 @@ float4 ps_main() : SV_Target {
                InputVecSRVResource2);
 
   CComPtr<ID3D12Resource> ConvertedMatrixResource, ConvertedMatrixReadResource;
-  int ConvertedMatrixSize = 0;
+  UINT ConvertedMatrixSize = 0;
   {
     // Create source matrix info
     D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_SRC_INFO SrcInfo = {};
@@ -13306,8 +13312,8 @@ float4 ps_main() : SV_Target {
     // Create destination matrix info
     D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_DEST_INFO DestInfo = {};
     DestInfo.DestSize = 0; // Will be populated by driver
-    int SrcEltSize = 0;
-    int DestEltSize = 0;
+    UINT SrcEltSize = 0;
+    UINT DestEltSize = 0;
     switch (AccumulateProps.AccumulationType) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED:
@@ -13331,19 +13337,19 @@ float4 ps_main() : SV_Target {
       DestEltSize = 1; // FP8
       break;
     }
-    SrcInfo.SrcStride = Config.DimM * SrcEltSize;
-    SrcInfo.SrcSize = Config.DimM * Config.DimN * SrcEltSize;
+    SrcInfo.SrcStride = (UINT)(Config.DimM * SrcEltSize);
+    SrcInfo.SrcSize = (UINT)(Config.DimM * Config.DimN * SrcEltSize);
 
     DestInfo.DestLayout = Config.MatrixLayout;
     DestInfo.DestStride = 0;
-    DestInfo.NumRows = Config.DimM;
-    DestInfo.NumColumns = Config.DimN;
+    DestInfo.NumRows = (UINT)Config.DimM;
+    DestInfo.NumColumns = (UINT)Config.DimN;
 
     if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
-      DestInfo.DestStride = Config.DimM * DestEltSize;
+      DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize);
     } else if (Config.MatrixLayout ==
                D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
-      DestInfo.DestStride = Config.DimM * DestEltSize;
+      DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize);
     }
 
     // Create conversion info
@@ -13483,8 +13489,8 @@ float4 ps_main() : SV_Target {
     ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver
     ConvertInfo.DestInfo.DestLayout =
         D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
-    ConvertInfo.DestInfo.NumRows = Config.DimM;
-    ConvertInfo.DestInfo.NumColumns = Config.DimN;
+    ConvertInfo.DestInfo.NumRows = (UINT)Config.DimM;
+    ConvertInfo.DestInfo.NumColumns = (UINT)Config.DimN;
 
     if (AccumulateProps.AccumulationType ==
             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32 ||
@@ -13495,10 +13501,10 @@ float4 ps_main() : SV_Target {
         AccumulateProps.AccumulationType ==
             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
       ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
-      ConvertInfo.DestInfo.DestStride = Config.DimN * sizeof(float);
+      ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(float));
     } else {
       ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
-      ConvertInfo.DestInfo.DestStride = Config.DimN * sizeof(int8_t);
+      ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(int8_t));
     }
 
     // Get destination size using preview interface
@@ -13549,7 +13555,7 @@ float4 ps_main() : SV_Target {
 
     float *ResultBuffer = (float *)MappedData.data();
     bool Equal = true;
-    for (int i = 0; i < (UINT)InputMatrix.size() / sizeof(float); i++) {
+    for (size_t i = 0; i < (UINT)InputMatrix.size() / sizeof(float); i++) {
       if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) ||
           fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) {
         LogErrorFmt(L"Result mismatch at index %d", i);

From ed3744a6ce86a6c23a38b1d464b5cb2cfb737e9b Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 16:49:20 -0400
Subject: [PATCH 20/26] Address style/format

---
 tools/clang/unittests/HLSLExec/CoopVec.h      | 152 ++++++++++--------
 .../unittests/HLSLExec/ExecutionTest.cpp      |  97 +++++------
 2 files changed, 133 insertions(+), 116 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index 689a4f214f..23a8dae170 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -369,7 +369,62 @@ bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
          DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32;
 }
 
-struct TestVector {
+static size_t
+GetVectorElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
+                     D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+  switch (DataType) {
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+    return sizeof(int8_t);
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+    return sizeof(int16_t);
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+    if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+        DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) {
+      return sizeof(int8_t);
+    } else {
+      return sizeof(int32_t);
+    }
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+    return sizeof(DirectX::PackedVector::HALF);
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+    return sizeof(float);
+  default:
+    throw std::invalid_argument("Unsupported data type");
+  }
+}
+
+static size_t
+GetMatrixElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+  switch (DataInterpretation) {
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+    // The CPU reference matrix is always int8 for all integer
+    // interpretations. The GPU version will be converted to the destination
+    // format by ConvertLinearAlgebraMatrix.
+    return sizeof(int8_t);
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+  case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+    // The CPU reference matrix is always FP32 for all FP interpretations.
+    // The GPU version will be converted to the destination format by
+    // ConvertLinearAlgebraMatrix.
+    return sizeof(float);
+  default:
+    throw std::invalid_argument("Unsupported data type");
+  }
+}
+
+class TestVector {
 private:
   size_t NumVectors = 0;
   size_t VectorSize = 0;
@@ -390,7 +445,7 @@ struct TestVector {
     if (ElementSize == 0)
       throw std::invalid_argument("ElementSize must be greater than 0");
 
-    size_t VectorBytes = VectorSize * ElementSize;
+    const size_t VectorBytes = VectorSize * ElementSize;
     Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment;
     TotalBytes = Stride * NumVectors;
 
@@ -550,22 +605,21 @@ struct TestVector {
         if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF> ||
                       std::is_same_v<T, float>) {
           float Elt = 0.0f;
-          if (IsIntegralDataType(MatrixInterpretation)) {
+
+          if (IsIntegralDataType(MatrixInterpretation))
             Elt = (float)(Rnd() & 0x7) - 3.0f;
-          } else {
+          else
             Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
-          }
-          if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
+
+          if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>)
             Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(Elt));
-          } else {
+          else
             Vec[J] = static_cast<T>(Elt);
-          }
         } else {
-          if constexpr (std::is_signed_v<T>) {
+          if constexpr (std::is_signed_v<T>)
             Vec[J] = static_cast<T>((int32_t)(Rnd() & 0xf) - 8);
-          } else {
+          else
             Vec[J] = static_cast<T>((uint32_t)(Rnd() & 0xf));
-          }
         }
     }
   }
@@ -596,36 +650,9 @@ struct TestVector {
                          D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation,
                          D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
                          std::mt19937 &Rnd) {
-    size_t ElementSize;
-    switch (DataType) {
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
-      ElementSize = sizeof(int8_t);
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
-      ElementSize = sizeof(int16_t);
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
-      if (DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-          DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) {
-        ElementSize = sizeof(int8_t);
-      } else {
-        ElementSize = sizeof(int32_t);
-      }
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
-      ElementSize = sizeof(DirectX::PackedVector::HALF);
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
-      ElementSize = sizeof(float);
-      break;
-    default:
-      throw std::invalid_argument("Unsupported data type");
-    }
+    const size_t ElementSize =
+        ::CoopVecHelpers::GetVectorElementSize(DataType, DataInterpretation);
+
     TestVector Vec(NumVectors, VectorSize, ElementSize);
     switch (DataType) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
@@ -670,25 +697,9 @@ struct TestVector {
   createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize,
                           D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation,
                           std::mt19937 &Rnd) {
-    size_t ElementSize;
-    switch (DataInterpretation) {
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
-      ElementSize = sizeof(int8_t);
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
-      ElementSize = sizeof(float);
-      break;
-    default:
-      throw std::invalid_argument("Unsupported data type");
-    }
+    const size_t ElementSize =
+        ::CoopVecHelpers::GetMatrixElementSize(DataInterpretation);
+
     TestVector Vec(NumVectors, VectorSize, ElementSize);
     switch (DataInterpretation) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
@@ -808,21 +819,20 @@ struct TestVector {
           for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize();
                ++InputIdx) {
             float InputElem;
-            if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
+            if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32)
               InputElem = InputVector.getVector<float>(VecIdx)[InputIdx];
-            } else {
+            else
               InputElem = ConvertFloat16ToFloat32(
                   InputVector.getVector<DirectX::PackedVector::HALF>(
                       VecIdx)[InputIdx]);
-            }
+
             float const MatrixElem =
                 Matrix.getVector<float>(OutputIdx)[InputIdx];
             Acc += InputElem * MatrixElem;
           }
 
-          if (HasBias) {
+          if (HasBias)
             Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
-          }
 
           float Result = Acc;
           ResultVec.getVector<float>(VecIdx)[OutputIdx] = Result;
@@ -838,19 +848,19 @@ struct TestVector {
           for (size_t InputIdx = 0; InputIdx < Matrix.getVectorSize();
                ++InputIdx) {
             int InputElem;
-            if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-              InputElem = (int)InputVector.getVector<float>(VecIdx)[InputIdx];
-            } else {
+            if (InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32)
+              InputElem = static_cast<int>(
+                  InputVector.getVector<float>(VecIdx)[InputIdx]);
+            else
               InputElem = InputVector.getVector<int8_t>(VecIdx)[InputIdx];
-            }
+
             int const MatrixElem =
                 Matrix.getVector<int8_t>(OutputIdx)[InputIdx];
             Acc += InputElem * MatrixElem;
           }
 
-          if (HasBias) {
+          if (HasBias)
             Acc += InputBiasI32[OutputIdx];
-          }
 
           float Result = float(Acc);
           ResultVec.getVector<float>(VecIdx)[OutputIdx] = Result;
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index afdde90029..f3dc75395e 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12013,9 +12013,9 @@ void ExecutionTest::runCoopVecMulTest() {
 #else
   // Create device and verify coopvec support
   CComPtr<ID3D12Device> D3DDevice;
-  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) {
+  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9))
     return;
-  }
+
   if (!DoesDeviceSupportCooperativeVector(D3DDevice)) {
     WEX::Logging::Log::Comment(
         "Device does not support cooperative vector. Skipping.");
@@ -12351,8 +12351,9 @@ void ExecutionTest::runCoopVecMulSubtest(
   std::mt19937 Rnd(0x42);
 
   LogCommentFmt(
-      L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: "
-      L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s, Stage: %s",
+      L"Running test for InputPerThread: %zu, OutputPerThread: %zu, "
+      L"NumThreads: "
+      L"%zu, NumLayers: %zu, Bias: %s, MatrixLayout: %s, Stage: %s",
       Config.InputPerThread, Config.OutputPerThread, Config.NumThreads,
       Config.NumLayers, Config.Bias ? L"true" : L"false",
       CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(),
@@ -12365,8 +12366,8 @@ void ExecutionTest::runCoopVecMulSubtest(
   CComPtr<ID3D12RootSignature> RootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE Ranges[2];
-    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + (UINT)Config.NumLayers,
-                   0,
+    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV,
+                   2 + static_cast<UINT>(Config.NumLayers), 0,
                    0); // InputVector, InputBias, InputMatrices[]
     Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer
 
@@ -12387,7 +12388,7 @@ void ExecutionTest::runCoopVecMulSubtest(
   {
     D3D12_DESCRIPTOR_HEAP_DESC Desc = {};
     Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-    Desc.NumDescriptors = 3 + (UINT)Config.NumLayers;
+    Desc.NumDescriptors = 3 + static_cast<UINT>(Config.NumLayers);
     Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
     VERIFY_SUCCEEDED(
         D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap)));
@@ -12656,7 +12657,6 @@ float4 ps_main() : SV_Target {
                     Options.data(), (int)Options.size(), IncludeHandler);
 
     D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {};
-    // psoDesc.InputLayout;
     PsoDesc.pRootSignature = RootSignature;
     PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader);
     PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader);
@@ -12711,7 +12711,7 @@ float4 ps_main() : SV_Target {
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector.getTotalBytes() / sizeof(int32_t)),
+               static_cast<UINT>(InputVector.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource);
 
   // Create input bias
@@ -12724,7 +12724,7 @@ float4 ps_main() : SV_Target {
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)),
+               static_cast<UINT>(InputBias.getTotalBytes() / sizeof(int32_t)),
                InputBiasSRVResource);
 
   // Create converted matrix resource and SRV for each input matrix
@@ -12779,7 +12779,8 @@ float4 ps_main() : SV_Target {
   CreateTestUavs(D3DDevice, CommandList, OutputBufferInit.data(),
                  OutputBufferSize, &UavResource, &UavUploadResource,
                  &UavReadResource);
-  CreateRawUAV(D3DDevice, BaseHandle, (UINT)OutputBufferSize / 4, UavResource);
+  CreateRawUAV(D3DDevice, BaseHandle, static_cast<UINT>(OutputBufferSize / 4),
+               UavResource);
 
   CommandList->Close();
   ExecuteCommandList(CommandQueue, CommandList);
@@ -12813,12 +12814,12 @@ float4 ps_main() : SV_Target {
     D3D12_RECT ScissorRect;
 
     memset(&Viewport, 0, sizeof(Viewport));
-    Viewport.Height = (float)RtDesc.Height;
-    Viewport.Width = (float)RtDesc.Width;
+    Viewport.Height = static_cast<float>(RtDesc.Height);
+    Viewport.Width = static_cast<float>(RtDesc.Width);
     Viewport.MaxDepth = 1.0f;
     memset(&ScissorRect, 0, sizeof(ScissorRect));
-    ScissorRect.right = (long)RtDesc.Width;
-    ScissorRect.bottom = RtDesc.Height;
+    ScissorRect.right = static_cast<LONG>(RtDesc.Width);
+    ScissorRect.bottom = static_cast<LONG>(RtDesc.Height);
     CommandList->SetGraphicsRootSignature(RootSignature);
     CommandList->SetGraphicsRootDescriptorTable(0, ResHandle);
     CommandList->SetGraphicsRootUnorderedAccessView(
@@ -12849,9 +12850,9 @@ float4 ps_main() : SV_Target {
   WaitForSignal(CommandQueue, FO);
 
   {
-    MappedData MappedData(UavReadResource, (UINT)OutputBufferSize);
+    MappedData MappedData(UavReadResource, static_cast<UINT>(OutputBufferSize));
 
-    float *ResultBuffer = (float *)MappedData.data();
+    float *ResultBuffer = reinterpret_cast<float *>(MappedData.data());
     bool Equal = true;
 
     float MaxError = 0.00001f;
@@ -12874,7 +12875,7 @@ float4 ps_main() : SV_Target {
         float Expected = ExpectedOutput.getVector<float>(i)[j];
         if (isnan(Result) || isnan(Expected) ||
             fabs(Result - Expected) > MaxError) {
-          LogErrorFmt(L"Result mismatch at vector %d, element %d", i, j);
+          LogErrorFmt(L"Result mismatch at vector %zu, element %zu", i, j);
           LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected);
           Equal = false;
           break;
@@ -12901,9 +12902,9 @@ void ExecutionTest::runCoopVecOuterProductTest() {
 #else
   // Create device and verify coopvec support
   CComPtr<ID3D12Device> D3DDevice;
-  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) {
+  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9))
     return;
-  }
+
   if (!DoesDeviceSupportCooperativeVector(D3DDevice)) {
     WEX::Logging::Log::Comment(
         "Device does not support cooperative vector. Skipping.");
@@ -12981,7 +12982,8 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
   std::mt19937 Rnd(0x42);
 
   LogCommentFmt(
-      L"Running test for DimM: %d, DimN: %d, NumThreads: %d, MatrixLayout: %s, "
+      L"Running test for DimM: %zu, DimN: %zu, NumThreads: %zu, MatrixLayout: "
+      L"%s, "
       L"Stage: %s",
       Config.DimM, Config.DimN, Config.NumThreads,
       CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str(),
@@ -13241,7 +13243,6 @@ float4 ps_main() : SV_Target {
                     Options, _countof(Options), IncludeHandler);
 
     D3D12_GRAPHICS_PIPELINE_STATE_DESC PsoDesc = {};
-    // psoDesc.InputLayout;
     PsoDesc.pRootSignature = RootSignature;
     PsoDesc.VS = CD3DX12_SHADER_BYTECODE(VertexShader);
     PsoDesc.PS = CD3DX12_SHADER_BYTECODE(PixelShader);
@@ -13293,12 +13294,14 @@ float4 ps_main() : SV_Target {
       &InputVecSRVResource2, &InputVecSRVUploadResource2);
 
   // This increments baseHandle
-  CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector1.getTotalBytes() / sizeof(int32_t)),
-               InputVecSRVResource1);
-  CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector2.getTotalBytes() / sizeof(int32_t)),
-               InputVecSRVResource2);
+  CreateRawSRV(
+      D3DDevice, BaseHandle,
+      static_cast<UINT>(InputVector1.getTotalBytes() / sizeof(int32_t)),
+      InputVecSRVResource1);
+  CreateRawSRV(
+      D3DDevice, BaseHandle,
+      static_cast<UINT>(InputVector2.getTotalBytes() / sizeof(int32_t)),
+      InputVecSRVResource2);
 
   CComPtr<ID3D12Resource> ConvertedMatrixResource, ConvertedMatrixReadResource;
   UINT ConvertedMatrixSize = 0;
@@ -13337,19 +13340,19 @@ float4 ps_main() : SV_Target {
       DestEltSize = 1; // FP8
       break;
     }
-    SrcInfo.SrcStride = (UINT)(Config.DimM * SrcEltSize);
-    SrcInfo.SrcSize = (UINT)(Config.DimM * Config.DimN * SrcEltSize);
+    SrcInfo.SrcStride = static_cast<UINT>(Config.DimM * SrcEltSize);
+    SrcInfo.SrcSize = static_cast<UINT>(Config.DimM * Config.DimN * SrcEltSize);
 
     DestInfo.DestLayout = Config.MatrixLayout;
     DestInfo.DestStride = 0;
-    DestInfo.NumRows = (UINT)Config.DimM;
-    DestInfo.NumColumns = (UINT)Config.DimN;
+    DestInfo.NumRows = static_cast<UINT>(Config.DimM);
+    DestInfo.NumColumns = static_cast<UINT>(Config.DimN);
 
     if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
-      DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize);
+      DestInfo.DestStride = static_cast<UINT>(Config.DimM * DestEltSize);
     } else if (Config.MatrixLayout ==
                D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
-      DestInfo.DestStride = (UINT)(Config.DimM * DestEltSize);
+      DestInfo.DestStride = static_cast<UINT>(Config.DimM * DestEltSize);
     }
 
     // Create conversion info
@@ -13439,12 +13442,12 @@ float4 ps_main() : SV_Target {
     D3D12_RECT ScissorRect;
 
     memset(&Viewport, 0, sizeof(Viewport));
-    Viewport.Height = (float)RtDesc.Height;
-    Viewport.Width = (float)RtDesc.Width;
+    Viewport.Height = static_cast<float>(RtDesc.Height);
+    Viewport.Width = static_cast<float>(RtDesc.Width);
     Viewport.MaxDepth = 1.0f;
     memset(&ScissorRect, 0, sizeof(ScissorRect));
-    ScissorRect.right = (long)RtDesc.Width;
-    ScissorRect.bottom = RtDesc.Height;
+    ScissorRect.right = static_cast<LONG>(RtDesc.Width);
+    ScissorRect.bottom = static_cast<LONG>(RtDesc.Height);
     CommandList->SetGraphicsRootSignature(RootSignature);
     CommandList->SetGraphicsRootDescriptorTable(0, ResHandle);
     CommandList->SetGraphicsRootUnorderedAccessView(
@@ -13489,8 +13492,8 @@ float4 ps_main() : SV_Target {
     ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver
     ConvertInfo.DestInfo.DestLayout =
         D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
-    ConvertInfo.DestInfo.NumRows = (UINT)Config.DimM;
-    ConvertInfo.DestInfo.NumColumns = (UINT)Config.DimN;
+    ConvertInfo.DestInfo.NumRows = static_cast<UINT>(Config.DimM);
+    ConvertInfo.DestInfo.NumColumns = static_cast<UINT>(Config.DimN);
 
     if (AccumulateProps.AccumulationType ==
             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32 ||
@@ -13501,10 +13504,12 @@ float4 ps_main() : SV_Target {
         AccumulateProps.AccumulationType ==
             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
       ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
-      ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(float));
+      ConvertInfo.DestInfo.DestStride =
+          static_cast<UINT>(Config.DimN * sizeof(float));
     } else {
       ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
-      ConvertInfo.DestInfo.DestStride = (UINT)(Config.DimN * sizeof(int8_t));
+      ConvertInfo.DestInfo.DestStride =
+          static_cast<UINT>(Config.DimN * sizeof(int8_t));
     }
 
     // Get destination size using preview interface
@@ -13551,15 +13556,17 @@ float4 ps_main() : SV_Target {
   WaitForSignal(CommandQueue, FO);
 
   {
-    MappedData MappedData(MatrixRowMajorReadResource, (UINT)InputMatrix.size());
+    MappedData MappedData(MatrixRowMajorReadResource,
+                          static_cast<UINT>(InputMatrix.size()));
 
     float *ResultBuffer = (float *)MappedData.data();
     bool Equal = true;
-    for (size_t i = 0; i < (UINT)InputMatrix.size() / sizeof(float); i++) {
+    for (size_t i = 0;
+         i < static_cast<UINT>(InputMatrix.size() / sizeof(float)); i++) {
       if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) ||
           fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) {
-        LogErrorFmt(L"Result mismatch at index %d", i);
-        LogErrorFmt(L"ResultBuffer[%d]: %f, ExpectedOutputBuffer[%d]: %f", i,
+        LogErrorFmt(L"Result mismatch at index %zu", i);
+        LogErrorFmt(L"ResultBuffer[%zu]: %f, ExpectedOutputBuffer[%zu]: %f", i,
                     ResultBuffer[i], i, ExpectedOutputBuffer[i]);
         Equal = false;
         break;

From 3efd69a9d7a478e0a34677f95b5e4a084a453f89 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 16:52:03 -0400
Subject: [PATCH 21/26] Fix missing static_cast in CoopVec.h

---
 tools/clang/unittests/HLSLExec/CoopVec.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index 23a8dae170..1db1c0c19b 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -757,21 +757,21 @@ class TestVector {
       DestEltSize = 1; // FP8
       break;
     }
-    ConvertInfo.SrcInfo.SrcStride = (UINT)getStride();
-    ConvertInfo.SrcInfo.SrcSize = (UINT)getTotalBytes();
+    ConvertInfo.SrcInfo.SrcStride = static_cast<UINT>(getStride());
+    ConvertInfo.SrcInfo.SrcSize = static_cast<UINT>(getTotalBytes());
 
     ConvertInfo.DestInfo.DestLayout = MatrixLayout;
     ConvertInfo.DestInfo.DestStride = 0;
-    ConvertInfo.DestInfo.NumRows = (UINT)getNumVectors();
-    ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize();
+    ConvertInfo.DestInfo.NumRows = static_cast<UINT>(getNumVectors());
+    ConvertInfo.DestInfo.NumColumns = static_cast<UINT>(getVectorSize());
 
     if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
       ConvertInfo.DestInfo.DestStride =
-          ((UINT)getVectorSize() * DestEltSize + 15) & ~15;
+          (static_cast<UINT>(getVectorSize()) * DestEltSize + 15) & ~15;
     } else if (MatrixLayout ==
                D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
       ConvertInfo.DestInfo.DestStride =
-          ((UINT)getNumVectors() * DestEltSize + 15) & ~15;
+          (static_cast<UINT>(getNumVectors()) * DestEltSize + 15) & ~15;
     }
 
     // Get destination size using preview interface

From f77d76f9aefd4b313b46ac0d591d2203a304194c Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Mon, 12 May 2025 16:57:25 -0400
Subject: [PATCH 22/26] Use proposed refactor for packed integer type
 exclusions in runCoopVecMulTestConfig

---
 .../unittests/HLSLExec/ExecutionTest.cpp      | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index f3dc75395e..ad1912f46b 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12317,19 +12317,23 @@ void ExecutionTest::runCoopVecMulTestConfig(
       continue;
     }
 
-    if (Config.NumLayers > 1 &&
-        (MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-         MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 ||
-         MulProps.InputInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-         MulProps.InputInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) &&
-        (MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 ||
-         MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32)) {
-      // We do not support multi-layer tests with packed types as input with
-      // full-precision integer bias Supporting this in the current framework
-      // would require repacking the accumulator vectors
-      continue;
+    if (Config.NumLayers > 1) {
+      const bool IsPackedType =
+          MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+          MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 ||
+          MulProps.InputInterpretation ==
+              D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+          MulProps.InputInterpretation ==
+              D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED;
+
+      const bool IsFullPrecisionIntegerBias =
+          MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 ||
+          MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32;
+
+      if (IsPackedType && IsFullPrecisionIntegerBias)
+        // In the current framework this would require repacking the accumulator
+        // vectors in HLSL.
+        continue;
     }
 
     bool IsInFilter = CoopVecHelpers::IsMatrixLayoutInFilter(

From 1e4662c46ac0b2224a4004b4c809f2a1e94cbe53 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 13 May 2025 09:02:06 -0400
Subject: [PATCH 23/26] Style fixes and rewrite of TestVector to use unique_ptr

---
 tools/clang/unittests/HLSLExec/CoopVec.h      | 199 +++++-------------
 .../unittests/HLSLExec/ExecutionTest.cpp      |  36 +++-
 2 files changed, 83 insertions(+), 152 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index 1db1c0c19b..e810117109 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -6,6 +6,7 @@
 #include <DirectXPackedVector.h>
 
 #include <cstdlib>
+#include <memory>
 #include <random>
 #include <vector>
 
@@ -13,7 +14,7 @@
 
 #include "CoopVecAPI.h"
 
-struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
+class LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
 private:
   DXC_MICROCOM_REF_FIELD(RefCount)
   dxc::DxcDllSupport &DxcSupport;
@@ -32,6 +33,8 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
       WEX::Common::String ParamValue;
       if (FAILED(WEX::TestExecution::RuntimeParameters::TryGetValue(
               L"LinAlgHeader", ParamValue))) {
+        WEX::Logging::Log::Error(
+            L"Missing expected TAEF runtime parameter LinAlgHeader");
         return E_FAIL;
       }
 
@@ -79,7 +82,7 @@ static std::vector<uint8_t> CreateAllOnesInputMatrix(size_t Width,
     } else if constexpr (std::is_same_v<EltTy, float>) {
       InputMatrix[i] = 1.0f;
     } else {
-      WEX::Logging::Log::Error(L"Unsupported input type");
+      VERIFY_FAIL(L"Unsupported input type");
       break;
     }
   }
@@ -91,60 +94,6 @@ static std::vector<uint8_t> CreateAllOnesInputMatrix(size_t Width,
   return Uint8InputMatrix;
 }
 
-template <typename EltTy>
-static std::vector<uint8_t> CreateInputVector(size_t NumThreads,
-                                              size_t EltsPerThread) {
-  std::vector<EltTy> InputVector(NumThreads * EltsPerThread);
-  std::fill(InputVector.begin(), InputVector.end(), EltTy(0));
-  if (EltsPerThread < 2) {
-    WEX::Logging::Log::Error(L"EltsPerThread must be at least 2");
-    return std::vector<uint8_t>();
-  }
-  for (size_t TID = 0; TID < NumThreads; TID++) {
-    if constexpr (std::is_same_v<EltTy, uint8_t> ||
-                  std::is_same_v<EltTy, int8_t>) {
-      InputVector[TID * EltsPerThread + 0] = 1;
-      InputVector[TID * EltsPerThread + 1] = 1;
-    } else if constexpr (std::is_same_v<EltTy, DirectX::PackedVector::HALF>) {
-      InputVector[TID * EltsPerThread + 0] = ConvertFloat32ToFloat16(1.0f);
-      InputVector[TID * EltsPerThread + 1] = ConvertFloat32ToFloat16(1.0f);
-    } else if constexpr (std::is_same_v<EltTy, float>) {
-      InputVector[TID * EltsPerThread + 0] = 1.0f;
-      InputVector[TID * EltsPerThread + 1] = 1.0f;
-    } else {
-      WEX::Logging::Log::Error(L"Unsupported input type");
-      break;
-    }
-  }
-
-  // Convert to uint8_t vector
-  std::vector<uint8_t> Uint8InputVector(InputVector.size() * sizeof(EltTy));
-  std::memcpy(Uint8InputVector.data(), InputVector.data(),
-              InputVector.size() * sizeof(EltTy));
-  return Uint8InputVector;
-}
-
-template <typename EltTy>
-static std::vector<uint8_t> CreateInputBias(size_t NumElts) {
-  std::vector<EltTy> InputBias(NumElts);
-  if constexpr (std::is_same_v<EltTy, uint8_t> ||
-                std::is_same_v<EltTy, int8_t>) {
-    std::fill(InputBias.begin(), InputBias.end(), EltTy(1));
-  } else if constexpr (std::is_same_v<EltTy, DirectX::PackedVector::HALF>) {
-    std::fill(InputBias.begin(), InputBias.end(),
-              ConvertFloat32ToFloat16(1.0f));
-  } else if constexpr (std::is_same_v<EltTy, int32_t>) {
-    std::fill(InputBias.begin(), InputBias.end(), 1);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported bias type");
-  }
-  // Convert to uint8_t vector
-  std::vector<uint8_t> Uint8InputBias(InputBias.size() * sizeof(EltTy));
-  std::memcpy(Uint8InputBias.data(), InputBias.data(),
-              InputBias.size() * sizeof(EltTy));
-  return Uint8InputBias;
-}
-
 static std::wstring
 DataTypeToFilterString(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   switch (DataType) {
@@ -173,7 +122,9 @@ DataTypeToFilterString(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
     return L"FLOAT_E5M2";
   default:
-    return L"<UNKNOWN>";
+    VERIFY_FAIL(WEX::Common::String().Format(
+        L"Unrecognized D3D12_LINEAR_ALGEBRA_DATATYPE: %d", DataType));
+    return L"";
   }
 }
 
@@ -266,7 +217,7 @@ GetStrideMultiplierForMatrixDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
     return 4;
   default:
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
+    VERIFY_FAIL(L"Unsupported matrix data type");
     return 1;
   }
 }
@@ -302,8 +253,8 @@ GetHlslDataTypeForDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
     return L"float";
   default:
-    WEX::Logging::Log::Error(L"Unsupported input data type");
-    return L"<UNKNOWN>";
+    VERIFY_FAIL(L"Unsupported input data type");
+    return L"";
   }
 }
 
@@ -335,8 +286,8 @@ GetHlslInterpretationForDataType(D3D12_LINEAR_ALGEBRA_DATATYPE Interpretation) {
   case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
     return L"DATA_TYPE_FLOAT8_E5M2";
   default:
-    WEX::Logging::Log::Error(L"Unsupported interpretation");
-    return L"<UNKNOWN>";
+    VERIFY_FAIL(L"Unsupported interpretation");
+    return L"";
   }
 }
 
@@ -360,7 +311,7 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) {
   }
 }
 
-bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
+static bool IsIntegralDataType(D3D12_LINEAR_ALGEBRA_DATATYPE DataType) {
   return DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
          DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 ||
          DataType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16 ||
@@ -394,7 +345,8 @@ GetVectorElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
   case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
     return sizeof(float);
   default:
-    throw std::invalid_argument("Unsupported data type");
+    VERIFY_FAIL(L"Unsupported data type");
+    return 0;
   }
 }
 
@@ -420,7 +372,8 @@ GetMatrixElementSize(D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
     // ConvertLinearAlgebraMatrix.
     return sizeof(float);
   default:
-    throw std::invalid_argument("Unsupported data type");
+    VERIFY_FAIL(L"Unsupported data type");
+    return 0;
   }
 }
 
@@ -431,7 +384,7 @@ class TestVector {
   size_t ElementSize = 0;
   size_t Stride = 0;
   size_t TotalBytes = 0;
-  uint8_t *Buffer = nullptr;
+  std::unique_ptr<uint8_t[]> Buffer;
 
 public:
   TestVector(size_t NumVectors, size_t VectorSize, size_t ElementSize,
@@ -439,24 +392,18 @@ class TestVector {
       : NumVectors(NumVectors), VectorSize(VectorSize),
         ElementSize(ElementSize) {
     if (NumVectors == 0)
-      throw std::invalid_argument("NumVectors must be greater than 0");
+      VERIFY_FAIL(L"NumVectors must be greater than 0");
     if (VectorSize == 0)
-      throw std::invalid_argument("VectorSize must be greater than 0");
+      VERIFY_FAIL(L"VectorSize must be greater than 0");
     if (ElementSize == 0)
-      throw std::invalid_argument("ElementSize must be greater than 0");
+      VERIFY_FAIL(L"ElementSize must be greater than 0");
 
     const size_t VectorBytes = VectorSize * ElementSize;
     Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment;
     TotalBytes = Stride * NumVectors;
 
-    void *Ptr = nullptr;
-#ifdef _MSC_VER
-    Ptr = _aligned_malloc(TotalBytes, Alignment);
-#else
-    Ptr = std::aligned_alloc(Alignment, TotalBytes);
-#endif
-    Buffer = reinterpret_cast<uint8_t *>(Ptr);
-    std::fill(Buffer, Buffer + TotalBytes, (uint8_t)0xFF);
+    Buffer = std::make_unique<uint8_t[]>(TotalBytes);
+    std::fill(Buffer.get(), Buffer.get() + TotalBytes, (uint8_t)0xFF);
   }
 
   // Copy constructor
@@ -464,17 +411,9 @@ class TestVector {
       : NumVectors(other.NumVectors), VectorSize(other.VectorSize),
         ElementSize(other.ElementSize), Stride(other.Stride),
         TotalBytes(other.TotalBytes) {
-
-    void *Ptr = nullptr;
-#ifdef _MSC_VER
-    Ptr = _aligned_malloc(TotalBytes, 16);
-#else
-    Ptr = std::aligned_alloc(16, TotalBytes);
-#endif
-    Buffer = reinterpret_cast<uint8_t *>(Ptr);
-
     if (other.Buffer) {
-      std::memcpy(Buffer, other.Buffer, TotalBytes);
+      Buffer = std::make_unique<uint8_t[]>(TotalBytes);
+      std::memcpy(Buffer.get(), other.Buffer.get(), TotalBytes);
     }
   }
 
@@ -482,48 +421,28 @@ class TestVector {
   TestVector(TestVector &&other) noexcept
       : NumVectors(other.NumVectors), VectorSize(other.VectorSize),
         ElementSize(other.ElementSize), Stride(other.Stride),
-        TotalBytes(other.TotalBytes), Buffer(other.Buffer) {
-
+        TotalBytes(other.TotalBytes), Buffer(std::move(other.Buffer)) {
     // Reset the source object
     other.NumVectors = 0;
     other.VectorSize = 0;
     other.ElementSize = 0;
     other.Stride = 0;
     other.TotalBytes = 0;
-    other.Buffer = nullptr;
   }
 
-  ~TestVector() {
-    if (Buffer) {
-#ifdef _MSC_VER
-      _aligned_free(Buffer);
-#else
-      std::free(Buffer);
-#endif
-    }
-  }
+  ~TestVector() = default;
 
   size_t getNumVectors() const { return NumVectors; }
   size_t getVectorSize() const { return VectorSize; }
   size_t getElementSize() const { return ElementSize; }
   size_t getStride() const { return Stride; }
   size_t getTotalBytes() const { return TotalBytes; }
-  uint8_t *getBuffer() { return Buffer; }
-  const uint8_t *getBuffer() const { return Buffer; }
+  uint8_t *getBuffer() { return Buffer.get(); }
+  const uint8_t *getBuffer() const { return Buffer.get(); }
 
   // Copy assignment operator
   TestVector &operator=(const TestVector &other) {
     if (this != &other) {
-      // Free existing buffer
-      if (Buffer) {
-#ifdef _MSC_VER
-        _aligned_free(Buffer);
-#else
-        std::free(Buffer);
-#endif
-        Buffer = nullptr;
-      }
-
       // Copy metadata
       NumVectors = other.NumVectors;
       VectorSize = other.VectorSize;
@@ -531,18 +450,13 @@ class TestVector {
       Stride = other.Stride;
       TotalBytes = other.TotalBytes;
 
-      // Allocate new buffer
-      void *Ptr = nullptr;
-#ifdef _MSC_VER
-      Ptr = _aligned_malloc(TotalBytes, 16);
-#else
-      Ptr = std::aligned_alloc(16, TotalBytes);
-#endif
-      Buffer = reinterpret_cast<uint8_t *>(Ptr);
-
       // Copy data
-      if (other.Buffer)
-        std::memcpy(Buffer, other.Buffer, TotalBytes);
+      if (other.Buffer) {
+        Buffer = std::make_unique<uint8_t[]>(TotalBytes);
+        std::memcpy(Buffer.get(), other.Buffer.get(), TotalBytes);
+      } else {
+        Buffer.reset();
+      }
     }
     return *this;
   }
@@ -550,22 +464,13 @@ class TestVector {
   // Move assignment operator
   TestVector &operator=(TestVector &&other) noexcept {
     if (this != &other) {
-      // Free existing buffer
-      if (Buffer) {
-#ifdef _MSC_VER
-        _aligned_free(Buffer);
-#else
-        std::free(Buffer);
-#endif
-      }
-
       // Move metadata and buffer
       NumVectors = other.NumVectors;
       VectorSize = other.VectorSize;
       ElementSize = other.ElementSize;
       Stride = other.Stride;
       TotalBytes = other.TotalBytes;
-      Buffer = other.Buffer;
+      Buffer = std::move(other.Buffer);
 
       // Reset the source object
       other.NumVectors = 0;
@@ -573,19 +478,16 @@ class TestVector {
       other.ElementSize = 0;
       other.Stride = 0;
       other.TotalBytes = 0;
-      other.Buffer = nullptr;
     }
     return *this;
   }
 
   template <typename T> T *getVector(size_t I) {
-    uint8_t *Ptr = Buffer + I * Stride;
-    return reinterpret_cast<T *>(Ptr);
+    return reinterpret_cast<T *>(Buffer.get() + I * Stride);
   }
 
   template <typename T> const T *getVector(size_t I) const {
-    const uint8_t *Ptr = Buffer + I * Stride;
-    return reinterpret_cast<const T *>(Ptr);
+    return reinterpret_cast<const T *>(Buffer.get() + I * Stride);
   }
 
   template <typename T> void fill(const T &Value) {
@@ -607,9 +509,9 @@ class TestVector {
           float Elt = 0.0f;
 
           if (IsIntegralDataType(MatrixInterpretation))
-            Elt = (float)(Rnd() & 0x7) - 3.0f;
+            Elt = static_cast<float>(Rnd() & 0x7) - 3.0f;
           else
-            Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
+            Elt = (static_cast<float>(Rnd() & 0x3) - 1.0f) / 2.0f;
 
           if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>)
             Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(Elt));
@@ -629,10 +531,10 @@ class TestVector {
       T *Vec = getVector<T>(I);
       for (size_t J = 0; J < VectorSize; ++J)
         if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
-          float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
+          float Elt = (static_cast<float>(Rnd() & 0x3) - 1.0f) / 2.0f;
           Vec[J] = static_cast<T>(ConvertFloat32ToFloat16(Elt));
         } else if constexpr (std::is_same_v<T, float>) {
-          float Elt = ((float)(Rnd() & 0x3) - 1.0f) / 2.0f;
+          float Elt = (static_cast<float>(Rnd() & 0x3) - 1.0f) / 2.0f;
           Vec[J] = static_cast<T>(Elt);
         } else {
           if constexpr (std::is_signed_v<T>) {
@@ -688,7 +590,8 @@ class TestVector {
       Vec.fillSimpleTestData<float>(MatrixInterpretation, Rnd);
       break;
     default:
-      throw std::invalid_argument("Unsupported data type");
+      VERIFY_FAIL(L"Unsupported data type");
+      break;
     }
     return Vec;
   }
@@ -717,7 +620,8 @@ class TestVector {
       Vec.FillSimpleMatrixTestData<float>(Rnd);
       break;
     default:
-      throw std::invalid_argument("Unsupported data type");
+      VERIFY_FAIL(L"Unsupported data type");
+      break;
     }
     return Vec;
   }
@@ -834,8 +738,7 @@ class TestVector {
           if (HasBias)
             Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
 
-          float Result = Acc;
-          ResultVec.getVector<float>(VecIdx)[OutputIdx] = Result;
+          ResultVec.getVector<float>(VecIdx)[OutputIdx] = Acc;
         }
       }
     } else if (MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
@@ -862,12 +765,12 @@ class TestVector {
           if (HasBias)
             Acc += InputBiasI32[OutputIdx];
 
-          float Result = float(Acc);
-          ResultVec.getVector<float>(VecIdx)[OutputIdx] = Result;
+          ResultVec.getVector<float>(VecIdx)[OutputIdx] =
+              static_cast<float>(Acc);
         }
       }
     } else {
-      throw std::invalid_argument("Unsupported matrix interpretation");
+      VERIFY_FAIL(L"Unsupported matrix interpretation");
     }
 
     return ResultVec;
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index ad1912f46b..f29ecb8fb4 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12013,14 +12013,28 @@ void ExecutionTest::runCoopVecMulTest() {
 #else
   // Create device and verify coopvec support
   CComPtr<ID3D12Device> D3DDevice;
-  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9))
+  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) {
+#ifdef _HLK_CONF
+    LOG_ERROR_FMT_THROW(
+        L"Device does not support SM 6.9. Can't run these tests.");
+#else
+    WEX::Logging::Log::Comment(
+        "Device does not support SM 6.9. Can't run these tests.");
+    WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
+#endif
+  }
 
   if (!DoesDeviceSupportCooperativeVector(D3DDevice)) {
+#ifdef _HLK_CONF
+    LOG_ERROR_FMT_THROW(
+        L"Device does not support cooperative vectors. Can't run these tests.");
+#else
     WEX::Logging::Log::Comment(
-        "Device does not support cooperative vector. Skipping.");
+        "Device does not support cooperative vectors. Can't run these tests.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
+#endif
   }
 
   // Query coopvec feature data. First call gets the size of the arrays. The
@@ -12906,14 +12920,28 @@ void ExecutionTest::runCoopVecOuterProductTest() {
 #else
   // Create device and verify coopvec support
   CComPtr<ID3D12Device> D3DDevice;
-  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9))
+  if (!CreateDevice(&D3DDevice, D3D_SHADER_MODEL_6_9)) {
+#ifdef _HLK_CONF
+    LOG_ERROR_FMT_THROW(
+        L"Device does not support SM 6.9. Can't run these tests.");
+#else
+    WEX::Logging::Log::Comment(
+        "Device does not support SM 6.9. Can't run these tests.");
+    WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
+#endif
+  }
 
   if (!DoesDeviceSupportCooperativeVector(D3DDevice)) {
+#ifdef _HLK_CONF
+    LOG_ERROR_FMT_THROW(
+        L"Device does not support cooperative vectors. Can't run these tests.");
+#else
     WEX::Logging::Log::Comment(
-        "Device does not support cooperative vector. Skipping.");
+        "Device does not support cooperative vectors. Can't run these tests.");
     WEX::Logging::Log::Result(WEX::Logging::TestResults::Skipped);
     return;
+#endif
   }
 
   // Query coopvec feature data. First call gets the size of the arrays. The

From 44b1ce9c8e9a9c6308a1757e7990cd5967d56697 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 13 May 2025 09:10:20 -0400
Subject: [PATCH 24/26] Move PipelineState closer to first use

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index f29ecb8fb4..ccd6f2b8c5 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12446,9 +12446,6 @@ void ExecutionTest::runCoopVecMulSubtest(
         I == 0 ? MulProps.InputType : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32);
   }
 
-  // Create the compute pipeline state for the CoopVec shader
-  CComPtr<ID3D12PipelineState> PipelineState;
-
   std::string ShaderSource = R"(
 #include "dx/linalg.h"
 
@@ -12661,6 +12658,9 @@ float4 ps_main() : SV_Target {
   CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
       new LinAlgHeaderIncludeHandler(m_support);
 
+  // Create the pipeline state for the CoopVec shaders
+  CComPtr<ID3D12PipelineState> PipelineState;
+
   if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
                      &PipelineState, Options.data(), (int)Options.size(),
@@ -13123,9 +13123,6 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
     return;
   }
 
-  // Create a compute pipeline state object.
-  CComPtr<ID3D12PipelineState> PipelineState;
-
   std::string ShaderSource = R"(
 #include "dx/linalg.h"
 
@@ -13261,6 +13258,9 @@ float4 ps_main() : SV_Target {
   CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
       new LinAlgHeaderIncludeHandler(m_support);
 
+  // Create the pipeline state for the CoopVec shaders
+  CComPtr<ID3D12PipelineState> PipelineState;
+
   if (RunCompute) {
     CreateComputePSO(D3DDevice, RootSignature, ShaderSource.c_str(), L"cs_6_9",
                      &PipelineState, Options, _countof(Options),

From 1275bd1f2f3a6776d39c03fd2e85930638d9cba2 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 13 May 2025 09:15:44 -0400
Subject: [PATCH 25/26] Rename createAllOnesTestMatrix to createSimpleTestMatrix
 to reflect its current implementation

---
 tools/clang/unittests/HLSLExec/CoopVec.h         | 12 +++++++++---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp |  4 ++--
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index e810117109..294e63df5e 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -597,9 +597,9 @@ class TestVector {
   }
 
   static TestVector
-  createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize,
-                          D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation,
-                          std::mt19937 &Rnd) {
+  createSimpleTestMatrix(size_t NumVectors, size_t VectorSize,
+                         D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation,
+                         std::mt19937 &Rnd) {
     const size_t ElementSize =
         ::CoopVecHelpers::GetMatrixElementSize(DataInterpretation);
 
@@ -611,12 +611,18 @@ class TestVector {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+      // The CPU reference matrix is always int8 for all integer
+      // interpretations. The GPU version will be converted to the destination
+      // format by ConvertLinearAlgebraMatrix.
       Vec.FillSimpleMatrixTestData<int8_t>(Rnd);
       break;
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+      // The CPU reference matrix is always FP32 for all FP interpretations.
+      // The GPU version will be converted to the destination format by
+      // ConvertLinearAlgebraMatrix.
       Vec.FillSimpleMatrixTestData<float>(Rnd);
       break;
     default:
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index ccd6f2b8c5..b54ebe6f95 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12420,12 +12420,12 @@ void ExecutionTest::runCoopVecMulSubtest(
   for (size_t I = 0; I < Config.NumLayers - 1; ++I) {
     // Each layer except the last is InputPerThread x InputPerThread
     InputMatrices.push_back(
-        ::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
+        ::CoopVecHelpers::TestVector::createSimpleTestMatrix(
             Config.InputPerThread, Config.InputPerThread,
             MulProps.MatrixInterpretation, Rnd));
   }
   // Last layer, matrix size is OutputPerThread x InputPerThread
-  InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
+  InputMatrices.push_back(::CoopVecHelpers::TestVector::createSimpleTestMatrix(
       Config.OutputPerThread, Config.InputPerThread,
       MulProps.MatrixInterpretation, Rnd));
 

From 237e0fb8457f9ae2d71e4ad4b1fa80a5df965c53 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 13 May 2025 14:33:37 -0400
Subject: [PATCH 26/26] Add comments for some magic numbers

---
 tools/clang/unittests/HLSLExec/CoopVec.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index 294e63df5e..c5c81800ac 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -508,6 +508,9 @@ class TestVector {
                       std::is_same_v<T, float>) {
           float Elt = 0.0f;
 
+          // Generate random input in the following ranges:
+          // - Integral types: [-3, 4] by 1
+          // - FP types: [-0.5, 1] by 0.5
           if (IsIntegralDataType(MatrixInterpretation))
             Elt = static_cast<float>(Rnd() & 0x7) - 3.0f;
           else
@@ -518,6 +521,9 @@ class TestVector {
           else
             Vec[J] = static_cast<T>(Elt);
         } else {
+          // Generate random input in the following ranges:
+          // - Signed types: [-8, 7] by 1
+          // - Unsigned types: [0, 15] by 1
           if constexpr (std::is_signed_v<T>)
             Vec[J] = static_cast<T>((int32_t)(Rnd() & 0xf) - 8);
           else
@@ -676,10 +682,12 @@ class TestVector {
     ConvertInfo.DestInfo.NumColumns = static_cast<UINT>(getVectorSize());
 
     if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
+      // Align to 16 bytes
       ConvertInfo.DestInfo.DestStride =
           (static_cast<UINT>(getVectorSize()) * DestEltSize + 15) & ~15;
     } else if (MatrixLayout ==
                D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
+      // Align to 16 bytes
       ConvertInfo.DestInfo.DestStride =
           (static_cast<UINT>(getNumVectors()) * DestEltSize + 15) & ~15;
     }