From 60ea20f7c23eae9b9b4f72b7ed9e50a9bf7a86dc Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 9 Apr 2026 15:52:21 -0600
Subject: [PATCH 1/6] [SM6.10][HLK] Fix GetElement test, add transpose to helper

---
 .../clang/unittests/HLSLExec/LinAlgTests.cpp  | 82 +++++++++++--------
 1 file changed, 46 insertions(+), 36 deletions(-)
diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
index 2e4ce65d57..46b4dc15b3 100644
--- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
+++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
@@ -228,37 +228,47 @@ static bool fillInputBuffer(LPCSTR Name, std::vector<BYTE> &Data,
   return false;
 }
 
-static VariantCompType makeExpected(ComponentType CompType, size_t NumElements,
-                                    float StartingVal, bool Increment) {
-  switch (CompType) {
-  case ComponentType::F32: {
-    std::vector<float> Floats(NumElements);
-    for (size_t I = 0; I < NumElements; I++)
-      Floats[I] = StartingVal + static_cast<float>(Increment ? I : 0);
-    return Floats;
-  }
-  case ComponentType::I32: {
-    DXASSERT(StartingVal < static_cast<float>(INT_MAX),
-             "Value too large to cast to int32_t");
-    std::vector<int32_t> Ints(NumElements);
-    for (size_t I = 0; I < NumElements; I++)
-      Ints[I] = static_cast<int32_t>(StartingVal) +
-                static_cast<int32_t>(Increment ? I : 0);
-    return Ints;
-  }
-  case ComponentType::F16: {
-    std::vector<HLSLHalf_t> Halfs(NumElements);
-    for (size_t I = 0; I < NumElements; I++) {
-      // Downcasting is safe here since HLSLHalf_t will clamp if F is too large.
-      float F = StartingVal + static_cast<float>(Increment ? I : 0);
-      Halfs[I] = HLSLHalf_t(F);
+static VariantCompType makeExpected(ComponentType CompType, int32_t M, int32_t N,
+                                    float StartingVal, bool Increment = true, bool Transpose = false) {
+  int32_t NumElements = M * N;
+  std::vector<float> Floats(NumElements);
+  std::vector<int32_t> Ints(NumElements);
+  std::vector<HLSLHalf_t> Halfs(NumElements);
+
+  for (int32_t I = 0; I < M; ++I) {
+    for (int32_t J = 0; J < M; ++J) {
+      int32_t Value = I * M + J;
+      int32_t Idx = Transpose ? J * N + I : Value;
+      switch (CompType) {
+      case ComponentType::F32:
+        Floats[Idx] = StartingVal + static_cast<float>(Increment ? Value : 0);
+        break;
+      case ComponentType::I32:
+        DXASSERT(StartingVal < static_cast<float>(INT_MAX),
+                 "Value too large to cast to int32_t");
+        Ints[Idx] = static_cast<int32_t>(StartingVal) + (Increment ? Value : 0);
+        break;
+      case ComponentType::F16: {
+        // Downcasting is safe here since HLSLHalf_t will clamp if F is too large.
+        float F = StartingVal + static_cast<float>(Increment ? Value : 0);
+        Halfs[Idx] = HLSLHalf_t(F);
+        break;
+      }
+      }
     }
-    return Halfs;
-  }
   }
 
-  DXASSERT(false, "Unable to fill unexpected ComponentType");
-  return std::vector<float>();
+  switch (CompType) {
+    case ComponentType::F32:
+      return Floats;
+    case ComponentType::I32:
+      return Ints;
+    case ComponentType::F16:
+      return Halfs;
+    default:
+      DXASSERT(false, "Unable to fill unexpected ComponentType");
+      return Floats;
+  }
 }
 
 static void logCompiledButSkipping() {
@@ -429,7 +439,7 @@ static void runLoadStoreRoundtrip(ID3D12Device *Device,
     return;
   }
 
-  auto Expected = makeExpected(Params.CompType, NumElements, 1, true);
+  auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1);
 
   // Construct the ShaderOp: two UAV buffers, load from one, store to other.
   auto Op = createComputeOp(LoadStoreShader, Target.c_str(), "UAV(u0), UAV(u1)",
@@ -517,7 +527,7 @@ static void runSplatStore(ID3D12Device *Device,
     return;
   }
 
-  auto Expected = makeExpected(Params.CompType, NumElements, FillValue, false);
+  auto Expected = makeExpected(Params.CompType, Params.M, Params.N, FillValue, false);
 
   auto Op = createComputeOp(SplatStoreShader, Target.c_str(), "UAV(u0)",
                             Args.c_str());
@@ -553,11 +563,13 @@ static const char ElementAccessShader[] = R"(
   RWByteAddressBuffer Output : register(u1);
 
   // flatten the 2D index into a 1D index then scale by element size
+  // Always store row-major and work it out in the test runner
   uint coordToByteOffset(uint2 coord) {
-    return (coord.x * MAJOR_DIM + coord.y) * ELEM_SIZE;
+    return (coord.y * N_DIM + coord.x) * ELEM_SIZE;
   }
 
 #ifndef EMULATE_TEST
+  [WaveSize(4, 64)]
   [numthreads(NUMTHREADS, 1, 1)]
   void main(uint threadIndex : SV_GroupIndex) {
     __builtin_LinAlgMatrix
@@ -605,8 +617,7 @@ static void runElementAccess(ID3D12Device *Device,
   const size_t NumThreads = Params.NumThreads;
   const size_t InputBufSize = Params.totalBytes();
   const size_t ElementSize = elementSize(Params.CompType);
-  const size_t MajorDim =
-      Params.Layout == LinalgMatrixLayout::RowMajor ? Params.M : Params.N;
+
   // Output: ElementSize bytes per element
   //   1 element for each mat idx
   //   1 uint for each thread's length
@@ -618,7 +629,6 @@ static void runElementAccess(ID3D12Device *Device,
     Target = "cs_6_8";
 
   std::stringstream ExtraDefs;
-  ExtraDefs << " -DMAJOR_DIM=" << MajorDim;
   std::string Args = buildCompilerArgs(Params, ExtraDefs.str().c_str());
 
   compileShader(DxcSupport, ElementAccessShader, Target.c_str(), Args, Verbose);
@@ -628,7 +638,7 @@ static void runElementAccess(ID3D12Device *Device,
     return;
   }
 
-  auto Expected = makeExpected(Params.CompType, NumElements, 1, true);
+  auto Expected = makeExpected(Params.CompType, Params.M, Params.N, 1);
 
   auto Op = createComputeOp(ElementAccessShader, Target.c_str(),
                             "UAV(u0), UAV(u1)", Args.c_str());
@@ -674,7 +684,7 @@ void DxilConf_SM610_LinAlg::ElementAccess_Wave_16x16_F16() {
   Params.Use = MatrixUse::Accumulator;
   Params.Scope = MatrixScope::Wave;
   Params.Layout = LinalgMatrixLayout::RowMajor;
-  Params.NumThreads = 4;
+  Params.NumThreads = 64;
   Params.Enable16Bit = true;
   Params.EmulateTest = EmulateTest;
   runElementAccess(D3DDevice, DxcSupport, Params, VerboseLogging, CompileOnly);

From db5c4dabc354cc0e383ce8cf2fbbb5da487ad390 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Thu, 9 Apr 2026 22:02:33 +0000
Subject: [PATCH 2/6] chore: autopublish 2026-04-09T22:02:33Z

---
 .../clang/unittests/HLSLExec/LinAlgTests.cpp  | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
index 46b4dc15b3..1b8dbe8800 100644
--- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
+++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
@@ -228,8 +228,10 @@ static bool fillInputBuffer(LPCSTR Name, std::vector<BYTE> &Data,
   return false;
 }
 
-static VariantCompType makeExpected(ComponentType CompType, int32_t M, int32_t N,
-                                    float StartingVal, bool Increment = true, bool Transpose = false) {
+static VariantCompType makeExpected(ComponentType CompType, int32_t M,
+                                    int32_t N, float StartingVal,
+                                    bool Increment = true,
+                                    bool Transpose = false) {
   int32_t NumElements = M * N;
   std::vector<float> Floats(NumElements);
   std::vector<int32_t> Ints(NumElements);
@@ -249,7 +251,8 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M, int32_t N
         Ints[Idx] = static_cast<int32_t>(StartingVal) + (Increment ? Value : 0);
         break;
       case ComponentType::F16: {
-        // Downcasting is safe here since HLSLHalf_t will clamp if F is too large.
+        // Downcasting is safe here since HLSLHalf_t will clamp if F is too
+        // large.
         float F = StartingVal + static_cast<float>(Increment ? Value : 0);
         Halfs[Idx] = HLSLHalf_t(F);
         break;
@@ -259,15 +262,15 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M, int32_t N
   }
 
   switch (CompType) {
-    case ComponentType::F32:
-      return Floats;
-    case ComponentType::I32:
-      return Ints;
-    case ComponentType::F16:
-      return Halfs;
-    default:
-      DXASSERT(false, "Unable to fill unexpected ComponentType");
-      return Floats;
+  case ComponentType::F32:
+    return Floats;
+  case ComponentType::I32:
+    return Ints;
+  case ComponentType::F16:
+    return Halfs;
+  default:
+    DXASSERT(false, "Unable to fill unexpected ComponentType");
+    return Floats;
   }
 }
 
@@ -527,7 +530,8 @@ static void runSplatStore(ID3D12Device *Device,
     return;
   }
 
-  auto Expected = makeExpected(Params.CompType, Params.M, Params.N, FillValue, false);
+  auto Expected =
+      makeExpected(Params.CompType, Params.M, Params.N, FillValue, false);
 
   auto Op = createComputeOp(SplatStoreShader, Target.c_str(), "UAV(u0)",
                             Args.c_str());

From 9cc7e5464d4a600d6e1c0eb0afa94ffbedcb714b Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 9 Apr 2026 16:34:23 -0600
Subject: [PATCH 3/6] address comments

---
 tools/clang/unittests/HLSLExec/LinAlgTests.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
index 1b8dbe8800..fe068093ae 100644
--- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
+++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
@@ -238,7 +238,7 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M,
   std::vector<HLSLHalf_t> Halfs(NumElements);
 
   for (int32_t I = 0; I < M; ++I) {
-    for (int32_t J = 0; J < M; ++J) {
+    for (int32_t J = 0; J < N; ++J) {
       int32_t Value = I * M + J;
       int32_t Idx = Transpose ? J * N + I : Value;
       switch (CompType) {
@@ -397,6 +397,7 @@ static const char LoadStoreShader[] = R"(
   RWByteAddressBuffer Output : register(u1);
 
 #ifndef EMULATE_TEST
+  [WaveSize(4, 64)]
   [numthreads(NUMTHREADS, 1, 1)]
   void main() {
     __builtin_LinAlgMatrix
@@ -476,7 +477,7 @@ void DxilConf_SM610_LinAlg::LoadStoreRoundtrip_Wave_16x16_F16() {
   Params.Use = MatrixUse::A;
   Params.Scope = MatrixScope::Wave;
   Params.Layout = LinalgMatrixLayout::RowMajor;
-  Params.NumThreads = 4;
+  Params.NumThreads = 64;
   Params.Enable16Bit = true;
   Params.EmulateTest = EmulateTest;
   runLoadStoreRoundtrip(D3DDevice, DxcSupport, Params, VerboseLogging,
@@ -487,6 +488,7 @@ static const char SplatStoreShader[] = R"(
   RWByteAddressBuffer Output : register(u0);
 
 #ifndef EMULATE_TEST
+  [WaveSize(4, 64)]
   [numthreads(NUMTHREADS, 1, 1)]
   void main() {
     __builtin_LinAlgMatrix
@@ -555,7 +557,7 @@ void DxilConf_SM610_LinAlg::SplatStore_Wave_16x16_F16() {
   Params.Use = MatrixUse::Accumulator;
   Params.Scope = MatrixScope::Wave;
   Params.Layout = LinalgMatrixLayout::RowMajor;
-  Params.NumThreads = 4;
+  Params.NumThreads = 64;
   Params.Enable16Bit = true;
   Params.EmulateTest = EmulateTest;
   runSplatStore(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging,

From 45bf58893a9d8f44eb03a2815b7d1da8664c7469 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 9 Apr 2026 17:49:56 -0600
Subject: [PATCH 4/6] Address comments

---
 .../clang/unittests/HLSLExec/LinAlgTests.cpp  | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
index fe068093ae..44981cd06f 100644
--- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
+++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
@@ -232,23 +232,25 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M,
                                     int32_t N, float StartingVal,
                                     bool Increment = true,
                                     bool Transpose = false) {
-  int32_t NumElements = M * N;
+  const size_t NumElements = M * N;
   std::vector<float> Floats(NumElements);
   std::vector<int32_t> Ints(NumElements);
   std::vector<HLSLHalf_t> Halfs(NumElements);
 
-  for (int32_t I = 0; I < M; ++I) {
-    for (int32_t J = 0; J < N; ++J) {
-      int32_t Value = I * M + J;
-      int32_t Idx = Transpose ? J * N + I : Value;
+  for (size_t I = 0; I < M; ++I) {
+    for (size_t J = 0; J < N; ++J) {
+      size_t Value = I * M + J;
+      size_t Idx = Transpose ? J * N + I : Value;
       switch (CompType) {
       case ComponentType::F32:
         Floats[Idx] = StartingVal + static_cast<float>(Increment ? Value : 0);
         break;
       case ComponentType::I32:
-        DXASSERT(StartingVal < static_cast<float>(INT_MAX),
+        VERIFY_IS_TRUE(StartingVal < static_cast<float>(std::numeric_limits<int32_t>::max()),
                  "Value too large to cast to int32_t");
-        Ints[Idx] = static_cast<int32_t>(StartingVal) + (Increment ? Value : 0);
+        VERIFY_IS_TRUE(StartingVal > static_cast<float>(std::numeric_limits<int32_t>::min()),
+                 "Value too small to cast to int32_t");
+        Ints[Idx] = static_cast<int32_t>(StartingVal) + static_cast<int32_t>(Increment ? Value : 0);
         break;
       case ComponentType::F16: {
         // Downcasting is safe here since HLSLHalf_t will clamp if F is too
@@ -257,6 +259,8 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M,
         Halfs[Idx] = HLSLHalf_t(F);
         break;
       }
+      default:
+        VERIFY_IS_TRUE(false, "Unable to fill unexpected ComponentType");
       }
     }
   }
@@ -269,7 +273,7 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M,
   case ComponentType::F16:
     return Halfs;
   default:
-    DXASSERT(false, "Unable to fill unexpected ComponentType");
+    VERIFY_IS_TRUE(false, "Unable to fill unexpected ComponentType");
     return Floats;
   }
 }

From 41ae95cce6fb37b447b332e7ef184dc9be0e4d32 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 9 Apr 2026 17:52:56 -0600
Subject: [PATCH 5/6] missing break

---
 tools/clang/unittests/HLSLExec/LinAlgTests.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
index 44981cd06f..4146da0cb2 100644
--- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
+++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
@@ -261,6 +261,7 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M,
       }
       default:
         VERIFY_IS_TRUE(false, "Unable to fill unexpected ComponentType");
+        break;
       }
     }
   }

From 6906edbd39ded993618c3093d8a50d42bf5fc5df Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Thu, 9 Apr 2026 23:57:40 +0000
Subject: [PATCH 6/6] chore: autopublish 2026-04-09T23:57:39Z

---
 tools/clang/unittests/HLSLExec/LinAlgTests.cpp | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
index 4146da0cb2..6eb637cdcd 100644
--- a/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
+++ b/tools/clang/unittests/HLSLExec/LinAlgTests.cpp
@@ -246,11 +246,14 @@ static VariantCompType makeExpected(ComponentType CompType, int32_t M,
         Floats[Idx] = StartingVal + static_cast<float>(Increment ? Value : 0);
         break;
       case ComponentType::I32:
-        VERIFY_IS_TRUE(StartingVal < static_cast<float>(std::numeric_limits<int32_t>::max()),
-                 "Value too large to cast to int32_t");
-        VERIFY_IS_TRUE(StartingVal > static_cast<float>(std::numeric_limits<int32_t>::min()),
-                 "Value too small to cast to int32_t");
-        Ints[Idx] = static_cast<int32_t>(StartingVal) + static_cast<int32_t>(Increment ? Value : 0);
+        VERIFY_IS_TRUE(StartingVal < static_cast<float>(
+                                         std::numeric_limits<int32_t>::max()),
+                       "Value too large to cast to int32_t");
+        VERIFY_IS_TRUE(StartingVal > static_cast<float>(
+                                         std::numeric_limits<int32_t>::min()),
+                       "Value too small to cast to int32_t");
+        Ints[Idx] = static_cast<int32_t>(StartingVal) +
+                    static_cast<int32_t>(Increment ? Value : 0);
         break;
       case ComponentType::F16: {
         // Downcasting is safe here since HLSLHalf_t will clamp if F is too