From fb890811c304a13d2318283b3b012b2804ee737c Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 6 May 2025 14:40:37 -0400
Subject: [PATCH 1/6] Clean up vector handling code by introducing TestVector

---
 tools/clang/unittests/HLSLExec/CoopVec.h      | 200 +++++++
 .../unittests/HLSLExec/ExecutionTest.cpp      | 532 +++++++-----------
 2 files changed, 416 insertions(+), 316 deletions(-)
diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index f166c61f67..cd24a556bd 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -4,6 +4,8 @@
 
 #include <DirectXMath.h>
 #include <DirectXPackedVector.h>
+
+#include <cstdlib>
 #include <vector>
 
 #include "dxc/Support/microcom.h"
@@ -61,6 +63,7 @@ struct LinAlgHeaderIncludeHandler : public IDxcIncludeHandler {
 };
 
 namespace CoopVecHelpers {
+
 template <typename EltTy>
 static std::vector<uint8_t> CreateAllOnesInputMatrix(uint32_t Width,
                                                      uint32_t Height) {
@@ -354,6 +357,203 @@ GetMatrixSrcDataType(D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation) {
     return D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
   }
 }
+
+// Owns an aligned byte buffer holding NumVectors equally sized vectors. Each
+// vector starts on a Stride boundary, so index through getVector<T>(I) rather
+// than assuming the vectors are tightly packed.
+struct TestVector {
+private:
+  size_t NumVectors = 0;
+  size_t VectorSize = 0;
+  size_t ElementSize = 0;
+  size_t Alignment = 16;
+  size_t Stride = 0;
+  size_t TotalBytes = 0;
+  uint8_t *Buffer = nullptr;
+
+  // Allocates Bytes of storage aligned to Align; throws instead of returning
+  // null so callers never write through a null buffer.
+  static uint8_t *allocate(size_t Bytes, size_t Align) {
+#ifdef _MSC_VER
+    void *Ptr = _aligned_malloc(Bytes, Align);
+#else
+    void *Ptr = std::aligned_alloc(Align, Bytes);
+#endif
+    if (!Ptr)
+      throw std::runtime_error("TestVector allocation failed");
+    return static_cast<uint8_t *>(Ptr);
+  }
+
+  // Rejects element types whose VectorSize elements would not fit in one
+  // Stride-sized slot (writing them would run past the slot or the buffer).
+  template <typename T> void requireFits() const {
+    if (VectorSize * sizeof(T) > Stride)
+      throw std::invalid_argument("Element type does not fit vector stride");
+  }
+
+  // Builds a TestVector with sizeof(T)-byte elements holding (1, 1, 0, ...).
+  template <typename T>
+  static TestVector makeSimple(size_t NumVectors, size_t VectorSize) {
+    TestVector Vec(NumVectors, VectorSize, sizeof(T));
+    Vec.fillSimpleTestData<T>();
+    return Vec;
+  }
+
+public:
+  // Creates NumVectors vectors of VectorSize elements, ElementSize bytes each.
+  // Each vector starts at a multiple of Stride, which is the vector's byte
+  // size rounded up to Alignment. The whole buffer is initialized to 0xFF.
+  // Throws std::invalid_argument if a count/size is 0 or Alignment is not a
+  // power of two.
+  TestVector(size_t NumVectors, size_t VectorSize, size_t ElementSize,
+             size_t Alignment = 16)
+      : NumVectors(NumVectors), VectorSize(VectorSize),
+        ElementSize(ElementSize), Alignment(Alignment) {
+    if (NumVectors == 0) {
+      throw std::invalid_argument("NumVectors must be greater than 0");
+    }
+    if (VectorSize == 0) {
+      throw std::invalid_argument("VectorSize must be greater than 0");
+    }
+    if (ElementSize == 0) {
+      throw std::invalid_argument("ElementSize must be greater than 0");
+    }
+    // Also guards the division below against Alignment == 0.
+    if (Alignment == 0 || (Alignment & (Alignment - 1)) != 0) {
+      throw std::invalid_argument("Alignment must be a power of two");
+    }
+
+    size_t VectorBytes = VectorSize * ElementSize;
+    Stride = ((VectorBytes + Alignment - 1) / Alignment) * Alignment;
+    TotalBytes = Stride * NumVectors;
+
+    Buffer = allocate(TotalBytes, Alignment);
+    std::fill(Buffer, Buffer + TotalBytes, (uint8_t)0xFF);
+  }
+
+  // Copy constructor: deep-copies the buffer using the source's alignment. A
+  // source without a buffer (moved-from) yields an empty copy.
+  TestVector(const TestVector &other)
+      : NumVectors(other.NumVectors), VectorSize(other.VectorSize),
+        ElementSize(other.ElementSize), Alignment(other.Alignment),
+        Stride(other.Stride), TotalBytes(other.TotalBytes) {
+    if (other.Buffer) {
+      Buffer = allocate(TotalBytes, Alignment);
+      std::memcpy(Buffer, other.Buffer, TotalBytes);
+    }
+  }
+
+  // Move constructor: takes over the buffer and leaves the source empty.
+  TestVector(TestVector &&other) noexcept
+      : NumVectors(other.NumVectors), VectorSize(other.VectorSize),
+        ElementSize(other.ElementSize), Alignment(other.Alignment),
+        Stride(other.Stride), TotalBytes(other.TotalBytes),
+        Buffer(other.Buffer) {
+    other.NumVectors = 0;
+    other.VectorSize = 0;
+    other.ElementSize = 0;
+    other.Stride = 0;
+    other.TotalBytes = 0;
+    other.Buffer = nullptr;
+  }
+
+  // Assignment is not supported; only construction, copy and move.
+  TestVector &operator=(const TestVector &) = delete;
+  TestVector &operator=(TestVector &&) = delete;
+
+  ~TestVector() {
+    if (Buffer) {
+#ifdef _MSC_VER
+      _aligned_free(Buffer);
+#else
+      std::free(Buffer);
+#endif
+    }
+  }
+
+  size_t getNumVectors() const { return NumVectors; }
+  size_t getVectorSize() const { return VectorSize; }
+  size_t getElementSize() const { return ElementSize; }
+  size_t getStride() const { return Stride; }
+  size_t getTotalBytes() const { return TotalBytes; }
+  uint8_t *getBuffer() { return Buffer; }
+  const uint8_t *getBuffer() const { return Buffer; }
+
+  // Returns a pointer to the start of vector I (byte offset I * Stride).
+  template <typename T> T *getVector(size_t I) {
+    uint8_t *Ptr = Buffer + I * Stride;
+    return reinterpret_cast<T *>(Ptr);
+  }
+
+  template <typename T> const T *getVector(size_t I) const {
+    const uint8_t *Ptr = Buffer + I * Stride;
+    return reinterpret_cast<const T *>(Ptr);
+  }
+
+  // Sets every element of every vector to Value.
+  template <typename T> void fill(const T &Value) {
+    requireFits<T>();
+    for (size_t I = 0; I < NumVectors; ++I) {
+      T *Vec = getVector<T>(I);
+      for (size_t J = 0; J < VectorSize; ++J)
+        Vec[J] = Value;
+    }
+  }
+
+  // Sets every vector to (1, 1, 0, ...).
+  template <typename T> void fillSimpleTestData() {
+    requireFits<T>();
+    for (size_t I = 0; I < NumVectors; ++I) {
+      T *Vec = getVector<T>(I);
+      for (size_t J = 0; J < VectorSize; ++J)
+        if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>) {
+          // Special case for HALF, which requires conversion from float
+          Vec[J] = static_cast<T>(
+              ConvertFloat32ToFloat16((J == 0 || J == 1) ? 1.0f : 0.0f));
+        } else {
+          Vec[J] = static_cast<T>((J == 0 || J == 1) ? 1 : 0);
+        }
+    }
+  }
+
+  // Creates a (1, 1, 0, ...) test vector whose element type follows DataType.
+  // SINT32/UINT32 data with a *_T4_PACKED interpretation is stored as one
+  // byte per element. Throws std::invalid_argument for unsupported types.
+  static TestVector
+  createSimpleTestVector(size_t NumVectors, size_t VectorSize,
+                         D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
+                         D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+    const bool Packed =
+        DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+        DataInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED;
+    switch (DataType) {
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+      return makeSimple<int8_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+      return makeSimple<uint8_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+      return makeSimple<int16_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+      return makeSimple<uint16_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+      // The packed case must use byte elements here too: filling int32_t
+      // elements into byte-sized slots would write past the buffer.
+      return Packed ? makeSimple<int8_t>(NumVectors, VectorSize)
+                    : makeSimple<int32_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+      return Packed ? makeSimple<uint8_t>(NumVectors, VectorSize)
+                    : makeSimple<uint32_t>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+      return makeSimple<DirectX::PackedVector::HALF>(NumVectors, VectorSize);
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+      return makeSimple<float>(NumVectors, VectorSize);
+    default:
+      throw std::invalid_argument("Unsupported data type");
+    }
+  }
+};
 }; // namespace CoopVecHelpers
 
 #endif // HAVE_COOPVEC_API
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 55d569dd8d..f47b4624d6 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12241,6 +12241,112 @@ void ExecutionTest::runCoopVecMulSubtest(
   CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle(
       DescriptorHeap->GetCPUDescriptorHandleForHeapStart());
 
+  // Setup input data
+  auto ExpectedOutputBuffer =
+      std::make_unique<float[]>(Config.OutputPerThread * Config.NumThreads);
+
+  std::vector<uint8_t> InputMatrix;
+  if (MulProps.MatrixInterpretation ==
+          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+      MulProps.MatrixInterpretation ==
+          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
+      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(
+        Config.InputPerThread, Config.OutputPerThread);
+  } else if (MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+    // Matrix source data is fp32, which gets converted to fp16 during matrix
+    // conversion
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(
+        Config.InputPerThread, Config.OutputPerThread);
+  } else {
+    WEX::Logging::Log::Error(L"Unsupported matrix data type");
+    return;
+  }
+
+  auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector(
+      Config.NumThreads, Config.InputPerThread, MulProps.InputType,
+      MulProps.InputInterpretation);
+  auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector(
+      1, Config.OutputPerThread, MulProps.BiasInterpretation,
+      MulProps.BiasInterpretation);
+
+  // Calculate reference output. Per-thread input vectors are read through
+  // getVector() because TestVector pads each vector to its stride.
+  // FIXME: This does not capture all cases, but is sufficient for the preview
+  // feature set
+  if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
+    int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer();
+
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
+        int Acc = 0;
+
+        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
+          int InputElem;
+          if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
+            InputElem =
+                (int)InputVector.getVector<float>(ThreadIdx)[InputIdx];
+          } else {
+            InputElem = InputVector.getVector<int8_t>(ThreadIdx)[InputIdx];
+          }
+          int const MatrixElem =
+              InputMatrix[OutputIdx * Config.InputPerThread + InputIdx];
+          Acc += InputElem * MatrixElem;
+        }
+
+        if (Config.Bias) {
+          Acc += InputBiasI32[OutputIdx];
+        }
+
+        float Result = float(Acc);
+        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
+            Result;
+      }
+    }
+  } else if (MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+             MulProps.MatrixInterpretation ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+    DirectX::PackedVector::HALF *InputBiasFP16 =
+        (DirectX::PackedVector::HALF *)InputBias.getBuffer();
+
+    // The CPU reference matrix is float
+    std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
+    std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
+
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      const DirectX::PackedVector::HALF *ThreadInput =
+          InputVector.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
+      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
+        float Acc = 0;
+
+        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
+          float const InputElem =
+              ConvertFloat16ToFloat32(ThreadInput[InputIdx]);
+          float const MatrixElem =
+              InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
+          Acc += InputElem * MatrixElem;
+        }
+
+        if (Config.Bias) {
+          Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
+        }
+
+        float Result = Acc;
+        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
+            Result;
+      }
+    }
+  }
+
   // Create the compute pipeline state for the CoopVec shader
   CComPtr<ID3D12PipelineState> ComputePipelineState;
   {
@@ -12258,9 +12364,7 @@ void main(uint threadIdx : SV_GroupThreadID)
 {
   using namespace dx::linalg;
 
-  // Ensure 4-byte alignment for vector loads
-  uint inputOffset = (INPUT_PER_THREAD * threadIdx * (sizeof(INPUT_DATA_TYPE) / INPUT_DIVISOR));
-  inputOffset = (inputOffset + 3) & ~3; // Align to 4 bytes
+  uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
   vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> >(inputOffset);
 
   MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix, 0, STRIDE };
@@ -12278,7 +12382,6 @@ void main(uint threadIdx : SV_GroupThreadID)
 
   // Ensure 4-byte alignment for vector store
   uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float);
-  outputOffset = (outputOffset + 3) & ~3; // Align to 4 bytes
   OutputBuffer.Store<vector<float, OUTPUT_PER_THREAD> >(outputOffset, result);
 }
     )";
@@ -12349,6 +12452,8 @@ void main(uint threadIdx : SV_GroupThreadID)
     auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
     auto AccumInterpretationEnumDefine = CreateDefineFromString(
         L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum);
+    auto InputVectorStrideDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
 
     LPCWSTR Options[] = {
         L"-enable-16bit-types",
@@ -12364,6 +12469,7 @@ void main(uint threadIdx : SV_GroupThreadID)
         MatrixDataTypeEnumDefine.c_str(),
         UseBiasDefine.c_str(),
         AccumInterpretationEnumDefine.c_str(),
+        InputVectorStrideDefine.c_str(),
     };
 
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
@@ -12388,36 +12494,9 @@ void main(uint threadIdx : SV_GroupThreadID)
       0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
       IID_PPV_ARGS(&CommandList)));
 
-  // Setup input data
-  auto ExpectedOutputBuffer =
-      std::make_unique<float[]>(Config.OutputPerThread * Config.NumThreads);
-
   // Setup input matrix as all-ones in sint8 format. This will later be
   // converted to the appropriate data type by the matrix conversion API.
   CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
-  std::vector<uint8_t> InputMatrix;
-  if (MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // Matrix source data is fp32, which gets converted to fp16 during matrix
-    // conversion
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
-    return;
-  }
 
   CreateTestResources(D3DDevice, CommandList, InputMatrix.data(),
                       InputMatrix.size(),
@@ -12427,180 +12506,31 @@ void main(uint threadIdx : SV_GroupThreadID)
   // Create input vector of an appropriate type. All integer types start as
   // SINT8 for now.
   CComPtr<ID3D12Resource> InputVecSRVResource, InputVecSRVUploadResource;
-  std::vector<uint8_t> InputVector;
-
-  if ((MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32 &&
-       (MulProps.InputInterpretation ==
-            D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-        MulProps.InputInterpretation ==
-            D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED)) ||
-      MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputVector = CoopVecHelpers::CreateInputVector<int8_t>(
-        Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    InputVector =
-        CoopVecHelpers::CreateInputVector<DirectX::PackedVector::HALF>(
-            Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    InputVector = CoopVecHelpers::CreateInputVector<float>(
-        Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) {
-    InputVector = CoopVecHelpers::CreateInputVector<int32_t>(
-        Config.NumThreads, Config.InputPerThread);
-  } else if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) {
-    InputVector = CoopVecHelpers::CreateInputVector<uint32_t>(
-        Config.NumThreads, Config.InputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported input data type");
-    return;
-  }
-  if (InputVector.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputVector.resize(InputVector.size() + 4 - (InputVector.size() % 4));
-  }
-  CreateTestResources(D3DDevice, CommandList, InputVector.data(),
-                      InputVector.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputVector.size()),
-                      &InputVecSRVResource, &InputVecSRVUploadResource);
+
+  CreateTestResources(
+      D3DDevice, CommandList, InputVector.getBuffer(),
+      InputVector.getTotalBytes(),
+      CD3DX12_RESOURCE_DESC::Buffer(InputVector.getTotalBytes()),
+      &InputVecSRVResource, &InputVecSRVUploadResource);
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector.size() / sizeof(int32_t)),
+               (UINT)(InputVector.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource);
 
   // Create input bias
   CComPtr<ID3D12Resource> InputBiasSRVResource, InputBiasSRVUploadResource;
-  std::vector<uint8_t> InputBias;
 
-  if (MulProps.BiasInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-      MulProps.BiasInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
-      MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputBias = CoopVecHelpers::CreateInputBias<int8_t>(Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32) {
-    InputBias =
-        CoopVecHelpers::CreateInputBias<int32_t>(Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32) {
-    InputBias =
-        CoopVecHelpers::CreateInputBias<uint32_t>(Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
-    InputBias = CoopVecHelpers::CreateInputBias<DirectX::PackedVector::HALF>(
-        Config.OutputPerThread);
-  } else if (MulProps.BiasInterpretation ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    InputBias = CoopVecHelpers::CreateInputBias<float>(Config.OutputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported bias data type");
-    return;
-  }
-
-  if (InputBias.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputBias.resize(InputBias.size() + 4 - (InputBias.size() % 4));
-  }
-  CreateTestResources(D3DDevice, CommandList, InputBias.data(),
-                      InputBias.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputBias.size()),
+  CreateTestResources(D3DDevice, CommandList, InputBias.getBuffer(),
+                      InputBias.getTotalBytes(),
+                      CD3DX12_RESOURCE_DESC::Buffer(InputBias.getTotalBytes()),
                       &InputBiasSRVResource, &InputBiasSRVUploadResource);
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputBias.size() / sizeof(int32_t)),
+               (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)),
                InputBiasSRVResource);
 
-  // Calculate reference output
-  // FIXME: This does not capture all cases, but is sufficient for the preview
-  // feature set
-  if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-    // The input bias is really an array of int32_t
-    std::vector<int32_t> InputBiasI32(InputBias.size() / sizeof(int32_t));
-    std::memcpy(InputBiasI32.data(), InputBias.data(), InputBias.size());
-
-    // The input vector is really an array of float if our vector input type is
-    // FLOAT32
-    std::vector<float> InputVectorF32(InputVector.size() / sizeof(int32_t));
-    if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-      std::memcpy(InputVectorF32.data(), InputVector.data(),
-                  InputVector.size());
-    }
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        int Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          int InputElem;
-          if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-            InputElem = (int)
-                InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx];
-          } else {
-            InputElem =
-                InputVector[ThreadIdx * Config.InputPerThread + InputIdx];
-          }
-          int const MatrixElem =
-              InputMatrix[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += InputBiasI32[OutputIdx];
-        }
-
-        float Result = float(Acc);
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // The input bias/vector is really an array of float16
-    std::vector<DirectX::PackedVector::HALF> InputVectorFP16(
-        InputVector.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputVectorFP16.data(), InputVector.data(), InputVector.size());
-
-    std::vector<DirectX::PackedVector::HALF> InputBiasFP16(
-        InputBias.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputBiasFP16.data(), InputBias.data(), InputBias.size());
-
-    // The CPU reference matrix is float
-    std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
-    std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        float Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          float const InputElem = ConvertFloat16ToFloat32(
-              InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]);
-          float const MatrixElem =
-              InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
-        }
-
-        float Result = Acc;
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
-  }
-
   CComPtr<ID3D12Resource> ConvertedMatrixResource;
   {
     // Create source matrix info
@@ -12862,6 +12792,80 @@ void ExecutionTest::runCoopVecOuterProductSubtest(
   CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle(
       DescriptorHeap->GetCPUDescriptorHandleForHeapStart());
 
+  // Setup input matrix as all-ones in sint8/fp32 format. This will later be
+  // converted to the appropriate data type by the matrix conversion API.
+
+  std::vector<uint8_t> InputMatrix;
+  if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+      AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(Config.DimN,
+                                                                   Config.DimM);
+  } else if (AccumulateProps.AccumulationType ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+             AccumulateProps.AccumulationType ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+             AccumulateProps.AccumulationType ==
+                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
+    // Matrix source data is fp32, which gets converted to fp16 during matrix
+    // conversion
+    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN,
+                                                                  Config.DimM);
+  } else {
+    WEX::Logging::Log::Error(L"Unsupported matrix data type");
+    return;
+  }
+
+  // Create input vectors
+  auto InputVector1 = CoopVecHelpers::TestVector::createSimpleTestVector(
+      Config.NumThreads, Config.DimM, AccumulateProps.InputType,
+      AccumulateProps.InputType);
+  auto InputVector2 = CoopVecHelpers::TestVector::createSimpleTestVector(
+      Config.NumThreads, Config.DimN, AccumulateProps.InputType,
+      AccumulateProps.InputType);
+
+  // Calculate reference output
+  auto ExpectedOutputBufferI8 =
+      CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN, Config.DimM);
+  std::vector<float> ExpectedOutputBuffer(ExpectedOutputBufferI8.size() /
+                                          sizeof(float));
+  std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(),
+              ExpectedOutputBufferI8.size());
+
+  if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
+    // Each thread contributes the outer product of its own pair of vectors.
+    // Per-thread vectors are Stride-aligned rather than packed, so they are
+    // looked up with getVector instead of indexing the raw buffer.
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      const DirectX::PackedVector::HALF *Vec1 =
+          InputVector1.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
+      const DirectX::PackedVector::HALF *Vec2 =
+          InputVector2.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
+
+      for (int M = 0; M < Config.DimM; ++M) {
+        for (int N = 0; N < Config.DimN; ++N) {
+          float acc = ConvertFloat16ToFloat32(Vec1[M]) *
+                      ConvertFloat16ToFloat32(Vec2[N]);
+          ExpectedOutputBuffer[M * Config.DimN + N] += acc;
+        }
+      }
+    }
+  } else if (AccumulateProps.InputType ==
+             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
+    // Same computation for fp32 inputs, again using the per-thread vector
+    // start rather than a packed ThreadIdx * Dim offset.
+    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      const float *Vec1 = InputVector1.getVector<float>(ThreadIdx);
+      const float *Vec2 = InputVector2.getVector<float>(ThreadIdx);
+
+      for (int M = 0; M < Config.DimM; ++M) {
+        for (int N = 0; N < Config.DimN; ++N) {
+          float Acc = Vec1[M] * Vec2[N];
+          ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
+        }
+      }
+    }
+  }
+
   // Create a compute pipeline state object.
   CComPtr<ID3D12PipelineState> ComputePipelineState;
   {
@@ -12880,12 +12884,10 @@ void main(uint threadIdx : SV_GroupThreadID)
   using namespace dx::linalg;
 
   // Ensure 4-byte alignment for vector loads
-  uint inputOffset1 = (DIM_M * threadIdx * sizeof(INPUT_DATA_TYPE));
-  inputOffset1 = (inputOffset1 + 3) & ~3; // Align to 4 bytes
+  uint inputOffset1 = threadIdx * INPUT_VECTOR_1_STRIDE;
   vector<INPUT_DATA_TYPE, DIM_M / INPUT_DIVISOR> input1 = InputVector1.Load<vector<INPUT_DATA_TYPE, DIM_M / INPUT_DIVISOR> >(inputOffset1);
 
-  uint inputOffset2 = (DIM_N * threadIdx * sizeof(INPUT_DATA_TYPE));
-  inputOffset2 = (inputOffset2 + 3) & ~3; // Align to 4 bytes
+  uint inputOffset2 = threadIdx * INPUT_VECTOR_2_STRIDE;
   vector<INPUT_DATA_TYPE, DIM_N / INPUT_DIVISOR> input2 = InputVector2.Load<vector<INPUT_DATA_TYPE, DIM_N / INPUT_DIVISOR> >(inputOffset2);
 
   RWMatrixRef<MATRIX_DATA_TYPE_ENUM, DIM_M, DIM_N, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { AccumMatrix, 0, STRIDE };
@@ -12954,6 +12956,10 @@ void main(uint threadIdx : SV_GroupThreadID)
         CreateDefineFromString(L"HLSL_MATRIX_LAYOUT", HlslMatrixLayout.c_str());
     auto MatrixDataTypeEnumDefine = CreateDefineFromString(
         L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum.c_str());
+    auto InputVector1StrideDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_1_STRIDE", (int)InputVector1.getStride());
+    auto InputVector2StrideDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_2_STRIDE", (int)InputVector2.getStride());
 
     LPCWSTR Options[] = {
         L"-enable-16bit-types",
@@ -12967,6 +12973,8 @@ void main(uint threadIdx : SV_GroupThreadID)
         InputInterpretationEnumDefine.c_str(),
         HlslMatrixLayoutDefine.c_str(),
         MatrixDataTypeEnumDefine.c_str(),
+        InputVector1StrideDefine.c_str(),
+        InputVector2StrideDefine.c_str(),
     };
 
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
@@ -12991,142 +12999,34 @@ void main(uint threadIdx : SV_GroupThreadID)
       0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
       IID_PPV_ARGS(&CommandList)));
 
-  // Setup input matrix as all-ones in sint8/fp32 format. This will later be
-  // converted to the appropriate data type by the matrix conversion API.
   CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
-  std::vector<uint8_t> InputMatrix;
-  if (AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      AccumulateProps.AccumulationType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(Config.DimN,
-                                                                   Config.DimM);
-  } else if (AccumulateProps.AccumulationType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             AccumulateProps.AccumulationType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             AccumulateProps.AccumulationType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // Matrix source data is fp32, which gets converted to fp16 during matrix
-    // conversion
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN,
-                                                                  Config.DimM);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
-    return;
-  }
-
   CreateTestResources(D3DDevice, CommandList, InputMatrix.data(),
                       InputMatrix.size(),
                       CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()),
                       &InputMatrixSRVResource, &InputMatrixSRVUploadResource);
 
-  // Create input vectors
   CComPtr<ID3D12Resource> InputVecSRVResource1, InputVecSRVUploadResource1;
-  std::vector<uint8_t> InputVector1;
   CComPtr<ID3D12Resource> InputVecSRVResource2, InputVecSRVUploadResource2;
-  std::vector<uint8_t> InputVector2;
-
-  if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputVector1 = CoopVecHelpers::CreateInputVector<int8_t>(Config.NumThreads,
-                                                             Config.DimM);
-    InputVector2 = CoopVecHelpers::CreateInputVector<int8_t>(Config.NumThreads,
-                                                             Config.DimN);
-  } else if (AccumulateProps.InputType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             AccumulateProps.InputType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             AccumulateProps.InputType ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    InputVector1 =
-        CoopVecHelpers::CreateInputVector<DirectX::PackedVector::HALF>(
-            Config.NumThreads, Config.DimM);
-    InputVector2 =
-        CoopVecHelpers::CreateInputVector<DirectX::PackedVector::HALF>(
-            Config.NumThreads, Config.DimN);
-  } else if (AccumulateProps.InputType ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    InputVector1 = CoopVecHelpers::CreateInputVector<float>(Config.NumThreads,
-                                                            Config.DimM);
-    InputVector2 = CoopVecHelpers::CreateInputVector<float>(Config.NumThreads,
-                                                            Config.DimN);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported input data type");
-    return;
-  }
-  if (InputVector1.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputVector1.resize(InputVector1.size() + 4 - (InputVector1.size() % 4));
-  }
-  if (InputVector2.size() % 4 != 0) {
-    // Align size to 4 bytes for ByteAddressBuffer
-    InputVector2.resize(InputVector2.size() + 4 - (InputVector2.size() % 4));
-  }
-  CreateTestResources(D3DDevice, CommandList, InputVector1.data(),
-                      InputVector1.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputVector1.size()),
-                      &InputVecSRVResource1, &InputVecSRVUploadResource1);
-  CreateTestResources(D3DDevice, CommandList, InputVector2.data(),
-                      InputVector2.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputVector2.size()),
-                      &InputVecSRVResource2, &InputVecSRVUploadResource2);
+
+  CreateTestResources(
+      D3DDevice, CommandList, InputVector1.getBuffer(),
+      InputVector1.getTotalBytes(),
+      CD3DX12_RESOURCE_DESC::Buffer(InputVector1.getTotalBytes()),
+      &InputVecSRVResource1, &InputVecSRVUploadResource1);
+  CreateTestResources(
+      D3DDevice, CommandList, InputVector2.getBuffer(),
+      InputVector2.getTotalBytes(),
+      CD3DX12_RESOURCE_DESC::Buffer(InputVector2.getTotalBytes()),
+      &InputVecSRVResource2, &InputVecSRVUploadResource2);
 
   // This increments baseHandle
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector1.size() / sizeof(int32_t)),
+               (UINT)(InputVector1.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource1);
   CreateRawSRV(D3DDevice, BaseHandle,
-               (UINT)(InputVector2.size() / sizeof(int32_t)),
+               (UINT)(InputVector2.getTotalBytes() / sizeof(int32_t)),
                InputVecSRVResource2);
 
-  // Calculate reference output
-  auto ExpectedOutputBufferI8 =
-      CoopVecHelpers::CreateAllOnesInputMatrix<float>(Config.DimN, Config.DimM);
-  std::vector<float> ExpectedOutputBuffer(ExpectedOutputBufferI8.size() /
-                                          sizeof(float));
-  std::memcpy(ExpectedOutputBuffer.data(), ExpectedOutputBufferI8.data(),
-              ExpectedOutputBufferI8.size());
-
-  if (AccumulateProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16) {
-    std::vector<DirectX::PackedVector::HALF> InputVector1FP16(
-        InputVector1.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputVector1FP16.data(), InputVector1.data(),
-                InputVector1.size());
-
-    std::vector<DirectX::PackedVector::HALF> InputVector2FP16(
-        InputVector2.size() / sizeof(DirectX::PackedVector::HALF));
-    std::memcpy(InputVector2FP16.data(), InputVector2.data(),
-                InputVector2.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int M = 0; M < Config.DimM; ++M) {
-        for (int N = 0; N < Config.DimN; ++N) {
-          float acc = ConvertFloat16ToFloat32(InputVector1FP16[M]) *
-                      ConvertFloat16ToFloat32(InputVector2FP16[N]);
-          ExpectedOutputBuffer[M * Config.DimN + N] += acc;
-        }
-      }
-    }
-  } else if (AccumulateProps.InputType ==
-             D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-    std::vector<float> InputVector1FP32(InputVector1.size() / sizeof(float));
-    std::memcpy(InputVector1FP32.data(), InputVector1.data(),
-                InputVector1.size());
-
-    std::vector<float> InputVector2FP32(InputVector2.size() / sizeof(float));
-    std::memcpy(InputVector2FP32.data(), InputVector2.data(),
-                InputVector2.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      for (int M = 0; M < Config.DimM; ++M) {
-        for (int N = 0; N < Config.DimN; ++N) {
-          float Acc = InputVector1FP32[ThreadIdx * Config.DimM + M] *
-                      InputVector2FP32[ThreadIdx * Config.DimN + N];
-          ExpectedOutputBuffer[M * Config.DimN + N] += Acc;
-        }
-      }
-    }
-  }
-
   CComPtr<ID3D12Resource> ConvertedMatrixResource, ConvertedMatrixReadResource;
   int ConvertedMatrixSize = 0;
   {

From 721087a382acf72acb888a0a49320226c48f8a28 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Tue, 6 May 2025 16:17:37 -0400
Subject: [PATCH 2/6] Support odd matrix/vector sizes

---
 .../unittests/HLSLExec/ExecutionTest.cpp      | 84 +++++++++++++------
 1 file changed, 58 insertions(+), 26 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index f47b4624d6..934210af1f 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12149,6 +12149,14 @@ void ExecutionTest::runCoopVecMulTestConfig(
       {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
       {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
@@ -12157,6 +12165,14 @@ void ExecutionTest::runCoopVecMulTestConfig(
       {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
       {16, 16, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
@@ -12165,6 +12181,14 @@ void ExecutionTest::runCoopVecMulTestConfig(
       {32, 8, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
        false},
       {16, 16, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
@@ -12181,6 +12205,22 @@ void ExecutionTest::runCoopVecMulTestConfig(
        false},
       {32, 8, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
        true},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 16, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
   };
 
   for (auto Config : TestConfigs) {
@@ -12280,18 +12320,15 @@ void ExecutionTest::runCoopVecMulSubtest(
   // FIXME: This does not capture all cases, but is sufficient for the preview
   // feature set
   if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-    int32_t *InputBiasI32 = (int32_t *)InputBias.getBuffer();
-    float *InputVectorF32 = (float *)InputVector.getBuffer();
-
     for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      int32_t *InputBiasI32 = InputBias.getVector<int32_t>(0);
       for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
         int Acc = 0;
 
         for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
           int InputElem;
           if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-            InputElem = (int)
-                InputVectorF32[ThreadIdx * Config.InputPerThread + InputIdx];
+            InputElem = (int)InputVector.getVector<float>(ThreadIdx)[InputIdx];
           } else {
             InputElem = InputVector.getVector<int8_t>(ThreadIdx)[InputIdx];
           }
@@ -12315,22 +12352,21 @@ void ExecutionTest::runCoopVecMulSubtest(
                  D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
              MulProps.MatrixInterpretation ==
                  D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    DirectX::PackedVector::HALF *InputVectorFP16 =
-        (DirectX::PackedVector::HALF *)InputVector.getBuffer();
-    DirectX::PackedVector::HALF *InputBiasFP16 =
-        (DirectX::PackedVector::HALF *)InputBias.getBuffer();
-
     // The CPU reference matrix is float
     std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
     std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
 
     for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
+      DirectX::PackedVector::HALF *InputVectorFP16 =
+          InputVector.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
+      DirectX::PackedVector::HALF *InputBiasFP16 =
+          InputBias.getVector<DirectX::PackedVector::HALF>(0);
       for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
         float Acc = 0;
 
         for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          float const InputElem = ConvertFloat16ToFloat32(
-              InputVectorFP16[ThreadIdx * Config.InputPerThread + InputIdx]);
+          float const InputElem =
+              ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]);
           float const MatrixElem =
               InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
           Acc += InputElem * MatrixElem;
@@ -12365,7 +12401,7 @@ void main(uint threadIdx : SV_GroupThreadID)
   using namespace dx::linalg;
 
   uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
-  vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_PER_THREAD / INPUT_DIVISOR> >(inputOffset);
+  vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> >(inputOffset);
 
   MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix, 0, STRIDE };
 
@@ -12439,8 +12475,9 @@ void main(uint threadIdx : SV_GroupThreadID)
     auto StrideDefine = CreateDefineFromInt(L"STRIDE", Stride);
     auto InputDataTypeDefine =
         CreateDefineFromString(L"INPUT_DATA_TYPE", InputDataType);
-    auto InputDivisorDefine =
-        CreateDefineFromInt(L"INPUT_DIVISOR", InputDivisor);
+    auto InputDivisorDefine = CreateDefineFromInt(
+        L"INPUT_VECTOR_NUM_ELEMENTS",
+        (Config.InputPerThread + InputDivisor - 1) / InputDivisor);
     auto AccumDataTypeDefine =
         CreateDefineFromString(L"ACCUM_DATA_TYPE", AccumDataType);
     auto InputInterpretationEnumDefine = CreateDefineFromString(
@@ -12596,11 +12633,12 @@ void main(uint threadIdx : SV_GroupThreadID)
           &ConvertInfo.DestInfo);
     }
 
+    int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16;
+
     // Create resource to hold matrix copy
-    CreateTestResources(
-        D3DDevice, CommandList, nullptr, 0,
-        CD3DX12_RESOURCE_DESC::Buffer(ConvertInfo.DestInfo.DestSize),
-        &ConvertedMatrixResource, nullptr);
+    CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize,
+                        CD3DX12_RESOURCE_DESC::Buffer(SRVSize),
+                        &ConvertedMatrixResource, nullptr);
 
     // Set up data descriptors
     ConvertInfo.DataDesc.DestVA =
@@ -12613,13 +12651,7 @@ void main(uint threadIdx : SV_GroupThreadID)
         __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
     CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
-    // This increments baseHandle
-    if ((ConvertInfo.DestInfo.DestSize % 4) != 0) {
-      WEX::Logging::Log::Error(L"DestSize is not aligned to 4 bytes");
-      return;
-    }
-    CreateRawSRV(D3DDevice, BaseHandle,
-                 ConvertInfo.DestInfo.DestSize / sizeof(int32_t),
+    CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t),
                  ConvertedMatrixResource);
   }
 

From 5dde799dc0d0cff77468aa877e5446b2414dac36 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 13:10:59 -0400
Subject: [PATCH 3/6] Finish support for NumLayers=2

---
 tools/clang/unittests/HLSLExec/CoopVec.h      | 275 ++++++++++++
 .../unittests/HLSLExec/ExecutionTest.cpp      | 410 +++++++++---------
 2 files changed, 492 insertions(+), 193 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index cd24a556bd..b5c0a2f355 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -448,6 +448,74 @@ struct TestVector {
   uint8_t *getBuffer() { return Buffer; }
   const uint8_t *getBuffer() const { return Buffer; }
 
+  // Copy assignment: deep-copies other's metadata and buffer contents.
+  // Throws std::runtime_error if the replacement buffer cannot be allocated,
+  // in which case this object is left unchanged.
+  TestVector &operator=(const TestVector &other) {
+    if (this != &other) {
+      // Build the copy before releasing anything. A source without a buffer
+      // (e.g. the source of a move assignment) gives an empty destination.
+      void *Ptr = nullptr;
+      if (other.Buffer) {
+#ifdef _MSC_VER
+        Ptr = _aligned_malloc(other.TotalBytes, 16);
+#else
+        Ptr = std::aligned_alloc(16, other.TotalBytes);
+#endif
+        if (!Ptr)
+          throw std::runtime_error("TestVector: buffer allocation failed");
+        std::memcpy(Ptr, other.Buffer, other.TotalBytes);
+      }
+
+      // Only now is it safe to release the old storage.
+      if (Buffer) {
+#ifdef _MSC_VER
+        _aligned_free(Buffer);
+#else
+        std::free(Buffer);
+#endif
+      }
+      NumVectors = other.NumVectors;
+      VectorSize = other.VectorSize;
+      ElementSize = other.ElementSize;
+      Stride = other.Stride;
+      TotalBytes = other.TotalBytes;
+      Buffer = reinterpret_cast<uint8_t *>(Ptr);
+    }
+    return *this;
+  }
+
+  // Move assignment: frees the current buffer, takes over other's buffer and
+  // metadata, and leaves other empty (zero sizes, null buffer).
+  TestVector &operator=(TestVector &&other) noexcept {
+    if (this == &other)
+      return *this; // Self-move: nothing to do.
+
+    if (Buffer != nullptr) {
+#ifdef _MSC_VER
+      _aligned_free(Buffer);
+#else
+      std::free(Buffer);
+#endif
+    }
+
+    // Adopt each field from the source and reset the source's copy.
+    Buffer = other.Buffer;
+    other.Buffer = nullptr;
+    TotalBytes = other.TotalBytes;
+    other.TotalBytes = 0;
+    Stride = other.Stride;
+    other.Stride = 0;
+    ElementSize = other.ElementSize;
+    other.ElementSize = 0;
+    VectorSize = other.VectorSize;
+    other.VectorSize = 0;
+    NumVectors = other.NumVectors;
+    other.NumVectors = 0;
+
+    return *this;
+  }
+
   template <typename T> T *getVector(size_t I) {
     uint8_t *Ptr = Buffer + I * Stride;
     return reinterpret_cast<T *>(Ptr);
@@ -481,6 +549,20 @@ struct TestVector {
     }
   }
 
+  // Sets every element of every vector to one, treating the storage as T.
+  template <typename T> void fillAllOnesTestData() {
+    T One;
+    if constexpr (std::is_same_v<T, DirectX::PackedVector::HALF>)
+      One = static_cast<T>(ConvertFloat32ToFloat16(1.0f)); // HALF: from float
+    else
+      One = static_cast<T>(1);
+    for (size_t VecIdx = 0; VecIdx < NumVectors; ++VecIdx) {
+      T *Row = getVector<T>(VecIdx);
+      for (size_t Elt = 0; Elt < VectorSize; ++Elt)
+        Row[Elt] = One;
+    }
+  }
+
   static TestVector
   createSimpleTestVector(size_t NumVectors, size_t VectorSize,
                          D3D12_LINEAR_ALGEBRA_DATATYPE DataType,
@@ -553,6 +635,199 @@ struct TestVector {
     }
     return Vec;
   }
+
+  // Creates a NumVectors x VectorSize matrix with every element set to one,
+  // stored in the source format for DataInterpretation: int8_t elements for
+  // the integer interpretations, float elements for the floating-point ones.
+  // Throws std::invalid_argument for any other interpretation.
+  static TestVector
+  createAllOnesTestMatrix(size_t NumVectors, size_t VectorSize,
+                          D3D12_LINEAR_ALGEBRA_DATATYPE DataInterpretation) {
+    // Classify the interpretation once; the element size and the fill
+    // routine both follow from this flag.
+    bool StoreAsInt8 = false;
+    switch (DataInterpretation) {
+    // Integer interpretations: stored as int8_t.
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32:
+      StoreAsInt8 = true;
+      break;
+    // Floating-point interpretations: stored as float.
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32:
+      StoreAsInt8 = false;
+      break;
+    default:
+      // Rejected before any storage is allocated.
+      throw std::invalid_argument("Unsupported data type");
+    }
+
+    // Allocate with the matching element size, then fill with the same type.
+    const size_t ElementSize = StoreAsInt8 ? sizeof(int8_t) : sizeof(float);
+    TestVector Ones(NumVectors, VectorSize, ElementSize);
+    if (StoreAsInt8) {
+      Ones.fillAllOnesTestData<int8_t>();
+    } else {
+      Ones.fillAllOnesTestData<float>();
+    }
+
+    return Ones;
+  }
+
+  // Describes the conversion of this row-major matrix (one vector per row) to
+  // DestDataType/MatrixLayout and queries D3DDevice for the destination size.
+  // Throws std::invalid_argument if DestDataType is not handled below.
+  D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO
+  getConversionInfo(ID3D12Device *D3DDevice,
+                    D3D12_LINEAR_ALGEBRA_DATATYPE DestDataType,
+                    D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout) {
+    // Source side: this object's buffer. Fields not set below stay zero.
+    D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {};
+    ConvertInfo.SrcInfo.SrcDataType =
+        ::CoopVecHelpers::GetMatrixSrcDataType(DestDataType);
+    ConvertInfo.SrcInfo.SrcLayout =
+        D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
+    ConvertInfo.SrcInfo.SrcStride = (UINT)getStride();
+    ConvertInfo.SrcInfo.SrcSize = (UINT)getTotalBytes();
+
+    // Destination element type and its size in bytes.
+    int DestEltSize = 0;
+    switch (DestDataType) {
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED:
+      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
+      DestEltSize = 1;
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
+      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16;
+      DestEltSize = 2; // FP16
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
+      ConvertInfo.DestInfo.DestDataType =
+          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3;
+      DestEltSize = 1; // FP8
+      break;
+    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
+      ConvertInfo.DestInfo.DestDataType =
+          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2;
+      DestEltSize = 1; // FP8
+      break;
+    default:
+      // Do not pass an unset destination type and zero stride to the device.
+      throw std::invalid_argument("Unsupported destination data type");
+    }
+
+    ConvertInfo.DestInfo.DestLayout = MatrixLayout;
+    ConvertInfo.DestInfo.NumRows = (UINT)getNumVectors();
+    ConvertInfo.DestInfo.NumColumns = (UINT)getVectorSize();
+
+    // Only the linear layouts get an explicit stride; otherwise it stays zero.
+    if (MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
+      ConvertInfo.DestInfo.DestStride = (UINT)getVectorSize() * DestEltSize;
+    } else if (MatrixLayout ==
+               D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
+      ConvertInfo.DestInfo.DestStride = (UINT)getNumVectors() * DestEltSize;
+    }
+
+    // DestSize (zero so far) is populated by the device query.
+    CComPtr<ID3D12DevicePreview> PreviewDevice;
+    VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview),
+                                               (void **)&PreviewDevice));
+    PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo(
+        &ConvertInfo.DestInfo);
+    return ConvertInfo;
+  }
+
+  // CPU reference for the matrix-vector multiply. For each vector in
+  // InputVector, computes the dot product with every row of Matrix, adds the
+  // matching element of Bias's first vector when HasBias is set, and stores
+  // the result as float. Returns one result vector per input vector.
+  // Throws std::invalid_argument for an unsupported MatrixInterpretation.
+  static TestVector
+  matrixVectorMultiply(const TestVector &Matrix, const TestVector &InputVector,
+                       const TestVector &Bias, bool HasBias,
+                       D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
+                       D3D12_LINEAR_ALGEBRA_DATATYPE InputType) {
+    const size_t NumInputs = InputVector.getNumVectors();
+    const size_t NumOutputs = Matrix.getNumVectors();
+    const size_t NumTerms = Matrix.getVectorSize();
+    const bool InputIsFP32 = InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32;
+
+    // FP16/E4M3/E5M2 interpretations read the matrix as float and accumulate
+    // in float; SINT8 reads it as int8 and accumulates in int.
+    const bool MatrixIsFloat =
+        MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
+        MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
+        MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2;
+    const bool MatrixIsSInt8 =
+        MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
+
+    TestVector Product(NumInputs, NumOutputs, sizeof(float));
+
+    if (!MatrixIsFloat && !MatrixIsSInt8)
+      throw std::invalid_argument("Unsupported matrix interpretation");
+
+    if (MatrixIsFloat) {
+      // Bias elements are fp16; inputs are fp32 or fp16 depending on InputType.
+      const DirectX::PackedVector::HALF *BiasFP16 =
+          Bias.getVector<DirectX::PackedVector::HALF>(0);
+      for (size_t Vec = 0; Vec < NumInputs; ++Vec) {
+        float *Out = Product.getVector<float>(Vec);
+        // Each matrix row yields one output element.
+        for (size_t Row = 0; Row < NumOutputs; ++Row) {
+          const float *Weights = Matrix.getVector<float>(Row);
+          float Sum = 0;
+
+          for (size_t K = 0; K < NumTerms; ++K) {
+            const float In =
+                InputIsFP32
+                    ? InputVector.getVector<float>(Vec)[K]
+                    : ConvertFloat16ToFloat32(
+                          InputVector.getVector<DirectX::PackedVector::HALF>(
+                              Vec)[K]);
+            Sum += In * Weights[K];
+          }
+
+          if (HasBias) {
+            Sum += ConvertFloat16ToFloat32(BiasFP16[Row]);
+          }
+          Out[Row] = Sum;
+        }
+      }
+      return Product;
+    }
+
+    // SINT8 path: bias elements are int32; inputs are fp32 (cast to int) or
+    // int8 depending on InputType.
+    const int32_t *BiasI32 = Bias.getVector<int32_t>(0);
+    for (size_t Vec = 0; Vec < NumInputs; ++Vec) {
+      float *Out = Product.getVector<float>(Vec);
+      // Each matrix row yields one output element.
+      for (size_t Row = 0; Row < NumOutputs; ++Row) {
+        const int8_t *Weights = Matrix.getVector<int8_t>(Row);
+        int Sum = 0;
+
+        for (size_t K = 0; K < NumTerms; ++K) {
+          const int In = InputIsFP32
+                             ? (int)InputVector.getVector<float>(Vec)[K]
+                             : InputVector.getVector<int8_t>(Vec)[K];
+          Sum += In * Weights[K];
+        }
+
+        if (HasBias) {
+          Sum += BiasI32[Row];
+        }
+        Out[Row] = float(Sum);
+      }
+    }
+    return Product;
+  }
 };
 }; // namespace CoopVecHelpers
 
diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 934210af1f..a613f28139 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -789,7 +789,7 @@ class ExecutionTest {
     int InputPerThread;
     int OutputPerThread;
     int NumThreads;
-    int NumLevels;
+    int NumLayers;
     D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT MatrixLayout;
     bool Bias;
   };
@@ -12221,6 +12221,88 @@ void ExecutionTest::runCoopVecMulTestConfig(
        false},
       {1, 1, 32, 1, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
        true},
+
+      // NumLayers=2 tests
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR, true},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR, true},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_MUL_OPTIMAL, true},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {16, 16, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {16, 16, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {32, 8, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {32, 8, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {17, 63, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 16, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       false},
+      {1, 1, 32, 2, D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_OUTER_PRODUCT_OPTIMAL,
+       true},
   };
 
   for (auto Config : TestConfigs) {
@@ -12234,6 +12316,21 @@ void ExecutionTest::runCoopVecMulTestConfig(
       continue;
     }
 
+    if (Config.NumLayers > 1 &&
+        (MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
+         MulProps.InputInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8 ||
+         MulProps.InputInterpretation ==
+             D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
+         MulProps.InputInterpretation ==
+             D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED) &&
+        (MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT32 ||
+         MulProps.BiasInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT32)) {
+      // We do not support multi-layer tests with packed types as input with
+      // full-precision integer bias. Supporting this in the current framework
+      // would require repacking the accumulator vectors.
+      continue;
+    }
+
     bool IsInFilter = CoopVecHelpers::IsMatrixLayoutInFilter(
         L"CoopVecMatrixLayout", Config.MatrixLayout);
     if (!IsInFilter) {
@@ -12250,9 +12347,9 @@ void ExecutionTest::runCoopVecMulSubtest(
 
   LogCommentFmt(
       L"Running test for InputPerThread: %d, OutputPerThread: %d, NumThreads: "
-      L"%d, NumLevels: %d, Bias: %s, MatrixLayout: %s",
+      L"%d, NumLayers: %d, Bias: %s, MatrixLayout: %s",
       Config.InputPerThread, Config.OutputPerThread, Config.NumThreads,
-      Config.NumLevels, Config.Bias ? L"true" : L"false",
+      Config.NumLayers, Config.Bias ? L"true" : L"false",
       CoopVecHelpers::MatrixLayoutToFilterString(Config.MatrixLayout).c_str());
 
   const int OutputBufferSize = (Config.OutputPerThread * Config.NumThreads * 4);
@@ -12261,8 +12358,8 @@ void ExecutionTest::runCoopVecMulSubtest(
   CComPtr<ID3D12RootSignature> RootSignature;
   {
     CD3DX12_DESCRIPTOR_RANGE Ranges[2];
-    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 3, 0,
-                   0); // InputVector, InputMatrix, InputBias
+    Ranges[0].Init(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 2 + Config.NumLayers, 0,
+                   0); // InputVector, InputBias, InputMatrices[]
     Ranges[1].Init(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 1, 0, 0); // OutputBuffer
     CreateRootSignatureFromRanges(D3DDevice, &RootSignature, Ranges, 2, nullptr,
                                   0);
@@ -12273,7 +12370,7 @@ void ExecutionTest::runCoopVecMulSubtest(
   {
     D3D12_DESCRIPTOR_HEAP_DESC Desc = {};
     Desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
-    Desc.NumDescriptors = 4;
+    Desc.NumDescriptors = 3 + Config.NumLayers;
     Desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
     VERIFY_SUCCEEDED(
         D3DDevice->CreateDescriptorHeap(&Desc, IID_PPV_ARGS(&DescriptorHeap)));
@@ -12281,106 +12378,35 @@ void ExecutionTest::runCoopVecMulSubtest(
   CD3DX12_CPU_DESCRIPTOR_HANDLE BaseHandle(
       DescriptorHeap->GetCPUDescriptorHandleForHeapStart());
 
-  // Setup input data
-  auto ExpectedOutputBuffer =
-      std::make_unique<float[]>(Config.OutputPerThread * Config.NumThreads);
-
-  std::vector<uint8_t> InputMatrix;
-  if (MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation ==
-          D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8_T4_PACKED ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8 ||
-      MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_UINT8) {
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<int8_t>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // Matrix source data is fp32, which gets converted to fp16 during matrix
-    // conversion
-    InputMatrix = CoopVecHelpers::CreateAllOnesInputMatrix<float>(
-        Config.InputPerThread, Config.OutputPerThread);
-  } else {
-    WEX::Logging::Log::Error(L"Unsupported matrix data type");
-    return;
-  }
+  // Our input matrix is really a set of row vectors, which we can represent
+  // as a TestVector.
+  std::vector<::CoopVecHelpers::TestVector> InputMatrices;
+  for (int I = 0; I < Config.NumLayers - 1; ++I) {
+    // Each layer except the last is InputPerThread x InputPerThread
+    InputMatrices.push_back(
+        ::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
+            Config.InputPerThread, Config.InputPerThread,
+            MulProps.MatrixInterpretation));
+  }
+  // Last layer, matrix size is OutputPerThread x InputPerThread
+  InputMatrices.push_back(::CoopVecHelpers::TestVector::createAllOnesTestMatrix(
+      Config.OutputPerThread, Config.InputPerThread,
+      MulProps.MatrixInterpretation));
 
   auto InputVector = CoopVecHelpers::TestVector::createSimpleTestVector(
       Config.NumThreads, Config.InputPerThread, MulProps.InputType,
       MulProps.InputInterpretation);
   auto InputBias = CoopVecHelpers::TestVector::createSimpleTestVector(
-      1, Config.OutputPerThread, MulProps.BiasInterpretation,
-      MulProps.BiasInterpretation);
+      1, std::max(Config.OutputPerThread, Config.InputPerThread),
+      MulProps.BiasInterpretation, MulProps.BiasInterpretation);
 
   // Calculate reference output
-  // FIXME: This does not capture all cases, but is sufficient for the preview
-  // feature set
-  if (MulProps.MatrixInterpretation == D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8) {
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      int32_t *InputBiasI32 = InputBias.getVector<int32_t>(0);
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        int Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          int InputElem;
-          if (MulProps.InputType == D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32) {
-            InputElem = (int)InputVector.getVector<float>(ThreadIdx)[InputIdx];
-          } else {
-            InputElem = InputVector.getVector<int8_t>(ThreadIdx)[InputIdx];
-          }
-          int const MatrixElem =
-              InputMatrix[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += InputBiasI32[OutputIdx];
-        }
-
-        float Result = float(Acc);
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
-  } else if (MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3 ||
-             MulProps.MatrixInterpretation ==
-                 D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2) {
-    // The CPU reference matrix is float
-    std::vector<float> InputMatrixFP32(InputMatrix.size() / sizeof(float));
-    std::memcpy(InputMatrixFP32.data(), InputMatrix.data(), InputMatrix.size());
-
-    for (int ThreadIdx = 0; ThreadIdx < Config.NumThreads; ++ThreadIdx) {
-      DirectX::PackedVector::HALF *InputVectorFP16 =
-          InputVector.getVector<DirectX::PackedVector::HALF>(ThreadIdx);
-      DirectX::PackedVector::HALF *InputBiasFP16 =
-          InputBias.getVector<DirectX::PackedVector::HALF>(0);
-      for (int OutputIdx = 0; OutputIdx < Config.OutputPerThread; ++OutputIdx) {
-        float Acc = 0;
-
-        for (int InputIdx = 0; InputIdx < Config.InputPerThread; ++InputIdx) {
-          float const InputElem =
-              ConvertFloat16ToFloat32(InputVectorFP16[InputIdx]);
-          float const MatrixElem =
-              InputMatrixFP32[OutputIdx * Config.InputPerThread + InputIdx];
-          Acc += InputElem * MatrixElem;
-        }
-
-        if (Config.Bias) {
-          Acc += ConvertFloat16ToFloat32(InputBiasFP16[OutputIdx]);
-        }
-
-        float Result = Acc;
-        ExpectedOutputBuffer[ThreadIdx * Config.OutputPerThread + OutputIdx] =
-            Result;
-      }
-    }
+  auto ExpectedOutput = InputVector;
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    ExpectedOutput = ::CoopVecHelpers::TestVector::matrixVectorMultiply(
+        InputMatrices[I], ExpectedOutput, InputBias, Config.Bias,
+        MulProps.MatrixInterpretation,
+        I == 0 ? MulProps.InputType : D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT32);
   }
 
   // Create the compute pipeline state for the CoopVec shader
@@ -12391,7 +12417,7 @@ void ExecutionTest::runCoopVecMulSubtest(
 
 ByteAddressBuffer InputVector : register(t0);
 ByteAddressBuffer InputBias : register(t1);
-ByteAddressBuffer InputMatrix : register(t2);
+ByteAddressBuffer InputMatrix[NUM_LAYERS] : register(t2);
 RWByteAddressBuffer OutputBuffer: register(u0);
 
 [shader("compute")]
@@ -12402,25 +12428,57 @@ void main(uint threadIdx : SV_GroupThreadID)
 
   uint inputOffset = (threadIdx * INPUT_VECTOR_STRIDE);
   vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> input = InputVector.Load<vector<INPUT_DATA_TYPE, INPUT_VECTOR_NUM_ELEMENTS> >(inputOffset);
+  VectorRef<BIAS_INTERPRETATION_ENUM> biasVec = { InputBias, 0 };
+
+  vector<ACCUM_DATA_TYPE, OUTPUT_PER_THREAD> output;
+)";
+
+    if (Config.NumLayers == 1) {
+      ShaderSource += R"(
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix[0], 0, STRIDE };
+
+  if (USE_BIAS) {
+    output = MulAdd<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
+  } else {
+    output = Mul<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+  }
+)";
+    } else if (Config.NumLayers == 2) {
+      ShaderSource += R"(
+  vector<ACCUM_DATA_TYPE, INPUT_PER_THREAD> accum;
 
-  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat = { InputMatrix, 0, STRIDE };
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE };
+  if (USE_BIAS) {
+    accum = MulAdd<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
+    //accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+  } else {
+    accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+  }
 
-  vector<ACCUM_DATA_TYPE, OUTPUT_PER_THREAD> accum;
+  // Dummy activation function; all of our intermediates are positive (currently).
+  accum = max(accum, 0);
 
+  MatrixRef<MATRIX_DATA_TYPE_ENUM, OUTPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat1 = { InputMatrix[1], 0, STRIDE };
   if (USE_BIAS) {
-    VectorRef<ACCUM_INTERPRETATION_ENUM> biasVec = { InputBias, 0 };
-    accum = MulAdd<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
+    output = MulAdd<ACCUM_DATA_TYPE>(mat1, MakeInterpretedVector<ACCUM_INTERPRETATION_ENUM>(accum), biasVec);
   } else {
-    accum = Mul<ACCUM_DATA_TYPE>(mat, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
+    output = Mul<ACCUM_DATA_TYPE>(mat1, MakeInterpretedVector<ACCUM_INTERPRETATION_ENUM>(accum));
   }
+)";
+    }
 
-  vector<float, OUTPUT_PER_THREAD> result = (vector<float, OUTPUT_PER_THREAD>)accum;
+    ShaderSource += R"(
+  vector<float, OUTPUT_PER_THREAD> result = (vector<float, OUTPUT_PER_THREAD>)output;
 
   // Ensure 4-byte alignment for vector store
   uint outputOffset = OUTPUT_PER_THREAD * threadIdx * sizeof(float);
   OutputBuffer.Store<vector<float, OUTPUT_PER_THREAD> >(outputOffset, result);
 }
-    )";
+)";
+
+#if 0
+    printf("%s\n", ShaderSource.c_str());
+#endif
 
     auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
       std::wstringstream Stream;
@@ -12462,7 +12520,7 @@ void main(uint threadIdx : SV_GroupThreadID)
     const std::wstring InputInterpretationEnum =
         CoopVecHelpers::GetHlslInterpretationForDataType(
             MulProps.InputInterpretation);
-    const std::wstring AccumInterpretationEnum =
+    const std::wstring BiasInterpretationEnum =
         CoopVecHelpers::GetHlslInterpretationForDataType(
             MulProps.BiasInterpretation);
 
@@ -12487,10 +12545,15 @@ void main(uint threadIdx : SV_GroupThreadID)
     auto MatrixDataTypeEnumDefine =
         CreateDefineFromString(L"MATRIX_DATA_TYPE_ENUM", MatrixDataTypeEnum);
     auto UseBiasDefine = CreateDefineFromInt(L"USE_BIAS", Config.Bias ? 1 : 0);
+    // Treat the accumulator interpretation the same as the input interpretation
+    // for the purposes of MakeInterpretedVector.
     auto AccumInterpretationEnumDefine = CreateDefineFromString(
-        L"ACCUM_INTERPRETATION_ENUM", AccumInterpretationEnum);
+        L"ACCUM_INTERPRETATION_ENUM", InputInterpretationEnum);
     auto InputVectorStrideDefine = CreateDefineFromInt(
         L"INPUT_VECTOR_STRIDE", (int)InputVector.getStride());
+    auto NumLayersDefine = CreateDefineFromInt(L"NUM_LAYERS", Config.NumLayers);
+    auto BiasInterpretationEnumDefine = CreateDefineFromString(
+        L"BIAS_INTERPRETATION_ENUM", BiasInterpretationEnum);
 
     LPCWSTR Options[] = {
         L"-enable-16bit-types",
@@ -12507,8 +12570,18 @@ void main(uint threadIdx : SV_GroupThreadID)
         UseBiasDefine.c_str(),
         AccumInterpretationEnumDefine.c_str(),
         InputVectorStrideDefine.c_str(),
+        NumLayersDefine.c_str(),
+        BiasInterpretationEnumDefine.c_str(),
     };
 
+#if 0
+    // Print options for debugging
+    WEX::Logging::Log::Comment(L"Shader compilation options:");
+    for (UINT i = 0; i < _countof(Options); i++) {
+      WEX::Logging::Log::Comment(Options[i]);
+    }
+#endif
+
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
         new LinAlgHeaderIncludeHandler(m_support);
 
@@ -12531,14 +12604,17 @@ void main(uint threadIdx : SV_GroupThreadID)
       0, D3D12_COMMAND_LIST_TYPE_DIRECT, CommandAllocator, ComputePipelineState,
       IID_PPV_ARGS(&CommandList)));
 
-  // Setup input matrix as all-ones in sint8 format. This will later be
-  // converted to the appropriate data type by the matrix conversion API.
-  CComPtr<ID3D12Resource> InputMatrixSRVResource, InputMatrixSRVUploadResource;
-
-  CreateTestResources(D3DDevice, CommandList, InputMatrix.data(),
-                      InputMatrix.size(),
-                      CD3DX12_RESOURCE_DESC::Buffer(InputMatrix.size()),
-                      &InputMatrixSRVResource, &InputMatrixSRVUploadResource);
+  std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVResources(
+      Config.NumLayers);
+  std::vector<CComPtr<ID3D12Resource>> InputMatrixSRVUploadResources(
+      Config.NumLayers);
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    CreateTestResources(
+        D3DDevice, CommandList, InputMatrices[I].getBuffer(),
+        InputMatrices[I].getTotalBytes(),
+        CD3DX12_RESOURCE_DESC::Buffer(InputMatrices[I].getTotalBytes()),
+        &InputMatrixSRVResources[I], &InputMatrixSRVUploadResources[I]);
+  }
 
   // Create input vector of an appropriate type. All integer types start as
   // SINT8 for now.
@@ -12568,82 +12644,25 @@ void main(uint threadIdx : SV_GroupThreadID)
                (UINT)(InputBias.getTotalBytes() / sizeof(int32_t)),
                InputBiasSRVResource);
 
-  CComPtr<ID3D12Resource> ConvertedMatrixResource;
-  {
-    // Create source matrix info
-    D3D12_LINEAR_ALGEBRA_MATRIX_CONVERSION_INFO ConvertInfo = {};
-    ConvertInfo.SrcInfo.SrcDataType =
-        CoopVecHelpers::GetMatrixSrcDataType(MulProps.MatrixInterpretation);
-    ConvertInfo.SrcInfo.SrcLayout =
-        D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR;
-
-    // Create destination matrix info
-    ConvertInfo.DestInfo.DestSize = 0; // Will be populated by driver
-    int SrcEltSize = 0;
-    int DestEltSize = 0;
-    switch (MulProps.MatrixInterpretation) {
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8:
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8_T4_PACKED:
-      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_SINT8;
-      SrcEltSize = 1;
-      DestEltSize = 1;
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
-      ConvertInfo.DestInfo.DestDataType = D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16;
-      SrcEltSize = 4;  // FP32
-      DestEltSize = 2; // FP16
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
-      ConvertInfo.DestInfo.DestDataType =
-          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3;
-      SrcEltSize = 4;  // FP32
-      DestEltSize = 1; // FP8
-      break;
-    case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
-      ConvertInfo.DestInfo.DestDataType =
-          D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2;
-      SrcEltSize = 4;  // FP32
-      DestEltSize = 1; // FP8
-      break;
-    }
-    ConvertInfo.SrcInfo.SrcStride = Config.InputPerThread * SrcEltSize;
-    ConvertInfo.SrcInfo.SrcSize =
-        Config.InputPerThread * Config.OutputPerThread * SrcEltSize;
-
-    ConvertInfo.DestInfo.DestLayout = Config.MatrixLayout;
-    ConvertInfo.DestInfo.DestStride = 0;
-    ConvertInfo.DestInfo.NumRows = Config.OutputPerThread;
-    ConvertInfo.DestInfo.NumColumns = Config.InputPerThread;
-
-    if (Config.MatrixLayout == D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_ROW_MAJOR) {
-      ConvertInfo.DestInfo.DestStride = Config.InputPerThread * DestEltSize;
-    } else if (Config.MatrixLayout ==
-               D3D12_LINEAR_ALGEBRA_MATRIX_LAYOUT_COLUMN_MAJOR) {
-      ConvertInfo.DestInfo.DestStride = Config.OutputPerThread * DestEltSize;
-    }
-
-    // Get destination size using preview interface
-    {
-      CComPtr<ID3D12DevicePreview> PreviewDevice;
-      VERIFY_SUCCEEDED(D3DDevice->QueryInterface(__uuidof(ID3D12DevicePreview),
-                                                 (void **)&PreviewDevice));
-
-      // Query required destination size
-      PreviewDevice->GetLinearAlgebraMatrixConversionDestinationInfo(
-          &ConvertInfo.DestInfo);
-    }
+  // Create converted matrix resource and SRV for each input matrix
+  std::vector<CComPtr<ID3D12Resource>> ConvertedMatrixResources(
+      Config.NumLayers);
+  for (int I = 0; I < Config.NumLayers; ++I) {
+    auto ConvertInfo = InputMatrices[I].getConversionInfo(
+        D3DDevice, MulProps.MatrixInterpretation, Config.MatrixLayout);
 
     int SRVSize = (ConvertInfo.DestInfo.DestSize + 15) / 16 * 16;
 
     // Create resource to hold matrix copy
     CreateTestResources(D3DDevice, CommandList, nullptr, SRVSize,
                         CD3DX12_RESOURCE_DESC::Buffer(SRVSize),
-                        &ConvertedMatrixResource, nullptr);
+                        &ConvertedMatrixResources[I], nullptr);
 
     // Set up data descriptors
     ConvertInfo.DataDesc.DestVA =
-        ConvertedMatrixResource->GetGPUVirtualAddress();
-    ConvertInfo.DataDesc.SrcVA = InputMatrixSRVResource->GetGPUVirtualAddress();
+        ConvertedMatrixResources[I]->GetGPUVirtualAddress();
+    ConvertInfo.DataDesc.SrcVA =
+        InputMatrixSRVResources[I]->GetGPUVirtualAddress();
 
     // Get command list interface and perform conversion
     CComPtr<ID3D12GraphicsCommandList11> CommandList11;
@@ -12651,8 +12670,9 @@ void main(uint threadIdx : SV_GroupThreadID)
         __uuidof(ID3D12GraphicsCommandList11), (void **)&CommandList11));
     CommandList11->ConvertLinearAlgebraMatrix(&ConvertInfo, 1);
 
+    // This increments BaseHandle
     CreateRawSRV(D3DDevice, BaseHandle, SRVSize / sizeof(int32_t),
-                 ConvertedMatrixResource);
+                 ConvertedMatrixResources[I]);
   }
 
   CComPtr<ID3D12Resource> UavResource;
@@ -12697,14 +12717,18 @@ void main(uint threadIdx : SV_GroupThreadID)
 
     float *ResultBuffer = (float *)MappedData.data();
     bool Equal = true;
-    for (int i = 0; i < OutputBufferSize / sizeof(float); i++) {
-      if (isnan(ResultBuffer[i]) || isnan(ExpectedOutputBuffer[i]) ||
-          fabs(ResultBuffer[i] - ExpectedOutputBuffer[i]) > 0.00001) {
-        LogErrorFmt(L"Result mismatch at index %d", i);
-        LogErrorFmt(L"ResultBuffer[%d]: %f, ExpectedOutputBuffer[%d]: %f", i,
-                    ResultBuffer[i], i, ExpectedOutputBuffer[i]);
-        Equal = false;
-        break;
+
+    for (int i = 0; i < Config.NumThreads; ++i) {
+      for (int j = 0; j < Config.OutputPerThread; ++j) {
+        float Result = ResultBuffer[i * Config.OutputPerThread + j];
+        float Expected = ExpectedOutput.getVector<float>(i)[j];
+        if (isnan(Result) || isnan(Expected) ||
+            fabs(Result - Expected) > 0.00001) {
+          LogErrorFmt(L"Result mismatch at index %d",
+                      i * Config.OutputPerThread + j);
+          LogErrorFmt(L"Result: %f, Expected: %f", Result, Expected);
+          Equal = false;
+        }
       }
     }
     VERIFY_IS_TRUE(Equal);

From a67edd490b2844ba259a74042305b564c7f4a870 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Wed, 7 May 2025 13:15:01 -0400
Subject: [PATCH 4/6] Remove dead code

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index a613f28139..3d69815034 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12476,10 +12476,6 @@ void main(uint threadIdx : SV_GroupThreadID)
 }
 )";
 
-#if 0
-    printf("%s\n", ShaderSource.c_str());
-#endif
-
     auto CreateDefineFromInt = [](const wchar_t *Name, int Value) {
       std::wstringstream Stream;
       Stream << L"-D" << Name << L"=" << Value;
@@ -12574,14 +12570,6 @@ void main(uint threadIdx : SV_GroupThreadID)
         BiasInterpretationEnumDefine.c_str(),
     };
 
-#if 0
-    // Print options for debugging
-    WEX::Logging::Log::Comment(L"Shader compilation options:");
-    for (UINT i = 0; i < _countof(Options); i++) {
-      WEX::Logging::Log::Comment(Options[i]);
-    }
-#endif
-
     CComPtr<LinAlgHeaderIncludeHandler> IncludeHandler =
         new LinAlgHeaderIncludeHandler(m_support);
 

From b2d35a973156b4b7a1948c520f4b6a0e2c15b6b5 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Thu, 8 May 2025 07:25:28 -0400
Subject: [PATCH 5/6] Remove dead line

---
 tools/clang/unittests/HLSLExec/ExecutionTest.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
index 3d69815034..ef769b12f7 100644
--- a/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
+++ b/tools/clang/unittests/HLSLExec/ExecutionTest.cpp
@@ -12450,7 +12450,6 @@ void main(uint threadIdx : SV_GroupThreadID)
   MatrixRef<MATRIX_DATA_TYPE_ENUM, INPUT_PER_THREAD, INPUT_PER_THREAD, HLSL_MATRIX_LAYOUT, /*transpose*/false> mat0 = { InputMatrix[0], 0, STRIDE };
   if (USE_BIAS) {
     accum = MulAdd<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input), biasVec);
-    //accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
   } else {
     accum = Mul<ACCUM_DATA_TYPE>(mat0, MakeInterpretedVector<INPUT_INTERPRETATION_ENUM>(input));
   }

From 11f3b6de20e9f574747d313b2c1c94ba4057b7e6 Mon Sep 17 00:00:00 2001
From: Justin Holewinski <jholewinski@nvidia.com>
Date: Thu, 8 May 2025 07:27:07 -0400
Subject: [PATCH 6/6] Add comment about ambiguous IsFP32 flag

---
 tools/clang/unittests/HLSLExec/CoopVec.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/CoopVec.h b/tools/clang/unittests/HLSLExec/CoopVec.h
index b5c0a2f355..18b8669197 100644
--- a/tools/clang/unittests/HLSLExec/CoopVec.h
+++ b/tools/clang/unittests/HLSLExec/CoopVec.h
@@ -749,12 +749,13 @@ struct TestVector {
                        const TestVector &Bias, bool HasBias,
                        D3D12_LINEAR_ALGEBRA_DATATYPE MatrixInterpretation,
                        D3D12_LINEAR_ALGEBRA_DATATYPE InputType) {
-    bool IsFP32 = false;
+    // The CPU reference matrix is FP32 for all FP interpretations.
+    bool IsMatrixFP32 = false;
     switch (MatrixInterpretation) {
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT16:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E4M3:
     case D3D12_LINEAR_ALGEBRA_DATATYPE_FLOAT_E5M2:
-      IsFP32 = true;
+      IsMatrixFP32 = true;
       break;
     default:
       break;
@@ -763,7 +764,7 @@ struct TestVector {
     TestVector ResultVec(InputVector.getNumVectors(), Matrix.getNumVectors(),
                          sizeof(float));
 
-    if (IsFP32) {
+    if (IsMatrixFP32) {
       for (int VecIdx = 0; VecIdx < InputVector.getNumVectors(); ++VecIdx) {
         const DirectX::PackedVector::HALF *InputBiasFP16 =
             Bias.getVector<DirectX::PackedVector::HALF>(0);