Treat matrix load/stores as column major in raw buffers (#4526)

kuhar · web-flow · commit 24ea9e83d297 · 2022-06-24T17:29:49.000-04:00
Assume that matrices are stored in the column major order in raw buffers, e.g., `ByteAddressBuffer` and `RWByteAddressBuffer`. Add a new flag, `-fspv-use-legacy-buffer-matrix-order`, so that shaders that depend on the previous matrix order (row major) can opt out of this change. Fixes: #3370
diff --git a/docs/SPIR-V.rst b/docs/SPIR-V.rst
@@ -4012,6 +4012,8 @@ codegen for Vulkan:
   the resource arrays must be marked with ``[unroll]``.
 - ``-fspv-entrypoint-name=<name>``: Specify the SPIR-V entry point name. Defaults
   to the HLSL entry point name.
+- ``-fspv-use-legacy-buffer-matrix-order``: Assumes the legacy matrix order (row
+  major) when accessing raw buffers (e.g., ByteAddressBuffer).
 - ``-Wno-vk-ignored-features``: Does not emit warnings on ignored features
   resulting from no Vulkan support, e.g., cbuffer member initializer.
 
diff --git a/include/dxc/Support/HLSLOptions.td b/include/dxc/Support/HLSLOptions.td
@@ -345,6 +345,8 @@ def fvk_use_dx_layout: Flag<["-"], "fvk-use-dx-layout">, Group<spirv_Group>, Fla
   HelpText<"Use DirectX memory layout for Vulkan resources">;
 def fvk_use_scalar_layout: Flag<["-"], "fvk-use-scalar-layout">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
   HelpText<"Use scalar memory layout for Vulkan resources">;
+def fspv_use_legacy_buffer_matrix_order: Flag<["-"], "fspv-use-legacy-buffer-matrix-order">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
+  HelpText<"Assume the legacy matrix order (row major) when accessing raw buffers (e.g., ByteAddressBuffer)">;
 def fspv_reflect: Flag<["-"], "fspv-reflect">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
   HelpText<"Emit additional SPIR-V instructions to aid reflection">;
 def fspv_debug_EQ : Joined<["-"], "fspv-debug=">, Group<spirv_Group>, Flags<[CoreOption, DriverOption]>,
diff --git a/include/dxc/Support/SPIRVOptions.h b/include/dxc/Support/SPIRVOptions.h
@@ -56,6 +56,7 @@ struct SpirvCodeGenOptions {
   bool noWarnIgnoredFeatures;
   bool useDxLayout;
   bool useGlLayout;
+  bool useLegacyBufferMatrixOrder;
   bool useScalarLayout;
   bool flattenResourceArrays;
   bool reduceLoadSize;
diff --git a/lib/DxcSupport/HLSLOptions.cpp b/lib/DxcSupport/HLSLOptions.cpp
@@ -976,6 +976,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
   opts.SpirvOptions.useGlLayout = Args.hasFlag(OPT_fvk_use_gl_layout, OPT_INVALID, false);
   opts.SpirvOptions.useDxLayout = Args.hasFlag(OPT_fvk_use_dx_layout, OPT_INVALID, false);
   opts.SpirvOptions.useScalarLayout = Args.hasFlag(OPT_fvk_use_scalar_layout, OPT_INVALID, false);
+  opts.SpirvOptions.useLegacyBufferMatrixOrder = Args.hasFlag(OPT_fspv_use_legacy_buffer_matrix_order, OPT_INVALID, false);
   opts.SpirvOptions.enableReflect = Args.hasFlag(OPT_fspv_reflect, OPT_INVALID, false);
   opts.SpirvOptions.noWarnIgnoredFeatures = Args.hasFlag(OPT_Wno_vk_ignored_features, OPT_INVALID, false);
   opts.SpirvOptions.noWarnEmulatedFeatures = Args.hasFlag(OPT_Wno_vk_emulated_features, OPT_INVALID, false);
@@ -1104,6 +1105,7 @@ int ReadDxcOpts(const OptTable *optionTable, unsigned flagsToInclude,
       Args.hasFlag(OPT_fvk_use_gl_layout, OPT_INVALID, false) ||
       Args.hasFlag(OPT_fvk_use_dx_layout, OPT_INVALID, false) ||
       Args.hasFlag(OPT_fvk_use_scalar_layout, OPT_INVALID, false) ||
+      Args.hasFlag(OPT_fspv_use_legacy_buffer_matrix_order, OPT_INVALID, false) ||
       Args.hasFlag(OPT_fspv_flatten_resource_arrays, OPT_INVALID, false) ||
       Args.hasFlag(OPT_fspv_reduce_load_size, OPT_INVALID, false) ||
       Args.hasFlag(OPT_fspv_reflect, OPT_INVALID, false) ||
diff --git a/tools/clang/lib/SPIRV/RawBufferMethods.cpp b/tools/clang/lib/SPIRV/RawBufferMethods.cpp
@@ -11,9 +11,11 @@
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/RecordLayout.h"
+#include "clang/AST/Type.h"
 #include "clang/SPIRV/AstTypeProbe.h"
 #include "clang/SPIRV/SpirvBuilder.h"
 #include "clang/SPIRV/SpirvInstruction.h"
+#include <cstdint>
 
 namespace {
 /// Rounds the given value up to the given power of 2.
@@ -290,22 +292,43 @@ SpirvInstruction *RawBufferHandler::processTemplatedLoadFromBuffer(
   // Matrix types
   {
     QualType elemType = {};
-    uint32_t numRows = 0, numCols = 0;
+    uint32_t numRows = 0;
+    uint32_t numCols = 0;
     if (isMxNMatrix(targetType, &elemType, &numRows, &numCols)) {
-      llvm::SmallVector<SpirvInstruction *, 4> loadedElems;
+      // In DX, the default matrix orientation in ByteAddressBuffer is column
+      // major. If HLSL/DXIL support the `column_major` and `row_major`
+      // attributes in the future, we will have to check for them here and
+      // override the behavior.
+      //
+      // The assumed buffer matrix order is controlled by the
+      // `-fspv-use-legacy-buffer-matrix-order` flag:
+      //   (a) false --> assume the matrix is stored column major
+      //   (b) true  --> assume the matrix is stored row major
+      //
+      // We provide (b) for compatibility with legacy shaders that depend on
+      // the previous, incorrect, raw buffer matrix order assumed by the SPIR-V
+      // codegen.
+      const bool isBufferColumnMajor =
+          !theEmitter.getSpirvOptions().useLegacyBufferMatrixOrder;
+      const uint32_t numElements = numRows * numCols;
+      llvm::SmallVector<SpirvInstruction *, 16> loadedElems(numElements);
+      for (uint32_t i = 0; i != numElements; ++i)
+        loadedElems[i] = processTemplatedLoadFromBuffer(buffer, index, elemType,
+                                                        bitOffset, range);
+
       llvm::SmallVector<SpirvInstruction *, 4> loadedRows;
       for (uint32_t i = 0; i < numRows; ++i) {
+        llvm::SmallVector<SpirvInstruction *, 4> loadedColumn;
         for (uint32_t j = 0; j < numCols; ++j) {
-          // TODO: This is currently doing a row_major matrix load. We must
-          // investigate whether we also need to implement it for column_major.
-          loadedElems.push_back(processTemplatedLoadFromBuffer(
-              buffer, index, elemType, bitOffset, range));
+          const uint32_t elementIndex =
+              isBufferColumnMajor ? (j * numRows + i) : (i * numCols + j);
+          loadedColumn.push_back(loadedElems[elementIndex]);
         }
         const auto rowType = astContext.getExtVectorType(elemType, numCols);
         loadedRows.push_back(spvBuilder.createCompositeConstruct(
-            rowType, loadedElems, loc, range));
-        loadedElems.clear();
+            rowType, loadedColumn, loc, range));
       }
+
       result = spvBuilder.createCompositeConstruct(targetType, loadedRows, loc,
                                                    range);
       result->setRValue();
@@ -593,14 +616,28 @@ QualType RawBufferHandler::serializeToScalarsOrStruct(
     QualType elemType = {};
     uint32_t numRows = 0, numCols = 0;
     if (isMxNMatrix(valueType, &elemType, &numRows, &numCols)) {
+      // Check if the destination buffer expects matrices in column major or row
+      // major order. In the future, we may also need to consider the
+      // `row_major` and `column_major` attributes. This is not handled by
+      // HLSL/DXIL at the moment, so we ignore them too.
+      const bool isBufferColumnMajor =
+          !theEmitter.getSpirvOptions().useLegacyBufferMatrixOrder;
       for (uint32_t i = 0; i < size; ++i) {
-        for (uint32_t j = 0; j < numRows; ++j) {
-          for (uint32_t k = 0; k < numCols; ++k) {
-            // TODO: This is currently doing a row_major matrix store. We must
-            // investigate whether we also need to implement it for
-            // column_major.
-            values->push_back(spvBuilder.createCompositeExtract(
-                elemType, values->front(), {j, k}, loc, range));
+        if (isBufferColumnMajor) {
+          // Access the matrix in the column major order.
+          for (uint32_t j = 0; j != numCols; ++j) {
+            for (uint32_t k = 0; k != numRows; ++k) {
+              values->push_back(spvBuilder.createCompositeExtract(
+                  elemType, values->front(), {k, j}, loc, range));
+            }
+          }
+        } else {
+          // Access the matrix in the row major order.
+          for (uint32_t j = 0; j != numRows; ++j) {
+            for (uint32_t k = 0; k != numCols; ++k) {
+              values->push_back(spvBuilder.createCompositeExtract(
+                  elemType, values->front(), {j, k}, loc, range));
+            }
           }
         }
         values->pop_front();
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.matrix.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-load.matrix.hlsl
@@ -20,7 +20,6 @@ void main(uint3 tid : SV_DispatchThreadId)
 // CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
 // CHECK:         [[word1:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK:          [[val2:%\d+]] = OpUConvert %ushort [[word1]]
-// CHECK:          [[row0:%\d+]] = OpCompositeConstruct %v3ushort [[val0]] [[val1]] [[val2]]
 // CHECK:           [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
 // CHECK:         [[word1:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK: [[shifted_word1:%\d+]] = OpShiftRightLogical %uint [[word1]] %uint_16
@@ -33,7 +32,8 @@ void main(uint3 tid : SV_DispatchThreadId)
 // CHECK:         [[word2:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK: [[shifted_word2:%\d+]] = OpShiftRightLogical %uint [[word2]] %uint_16
 // CHECK:          [[val5:%\d+]] = OpUConvert %ushort [[shifted_word2:%\d+]]
-// CHECK:          [[row1:%\d+]] = OpCompositeConstruct %v3ushort [[val3]] [[val4]] [[val5]]
+// CHECK:          [[row0:%\d+]] = OpCompositeConstruct %v3ushort [[val0]] [[val2]] [[val4]]
+// CHECK:          [[row1:%\d+]] = OpCompositeConstruct %v3ushort [[val1]] [[val3]] [[val5]]
 // CHECK:        [[matrix:%\d+]] = OpCompositeConstruct %_arr_v3ushort_uint_2 [[row0]] [[row1]]
 // CHECK:                          OpStore %u16 [[matrix]]
   uint16_t2x3 u16 = buf.Load<uint16_t2x3>(tid.x);
@@ -48,18 +48,27 @@ void main(uint3 tid : SV_DispatchThreadId)
 // CHECK:  [[word1:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK:   [[val1:%\d+]] = OpBitcast %int [[word1:%\d+]]
 // CHECK:[[index_2:%\d+]] = OpIAdd %uint [[index_1]] %uint_1
-// CHECK:   [[row0:%\d+]] = OpCompositeConstruct %v2int [[val0]] [[val1]]
 // CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2]]
 // CHECK:  [[word2:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK:   [[val2:%\d+]] = OpBitcast %int [[word2]]
 // CHECK:[[index_3:%\d+]] = OpIAdd %uint [[index_2]] %uint_1
 // CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
 // CHECK:  [[word3:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK:   [[val3:%\d+]] = OpBitcast %int [[word3]]
-// CHECK:   [[row1:%\d+]] = OpCompositeConstruct %v2int [[val2]] [[val3]]
-// CHECK: [[matrix:%\d+]] = OpCompositeConstruct %_arr_v2int_uint_2 [[row0]] [[row1]]
-// CHECK:                   OpStore %i [[matrix]]
-  int2x2 i = buf.Load<int2x2>(tid.x);
+// CHECK:[[index_4:%\d+]] = OpIAdd %uint [[index_3]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_4]]
+// CHECK:  [[word4:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:   [[val4:%\d+]] = OpBitcast %int [[word4]]
+// CHECK:[[index_5:%\d+]] = OpIAdd %uint [[index_4]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_5]]
+// CHECK:  [[word5:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:   [[val5:%\d+]] = OpBitcast %int [[word5]]
+// CHECK:   [[row0:%\d+]] = OpCompositeConstruct %v2int [[val0]] [[val3]]
+// CHECK:   [[row1:%\d+]] = OpCompositeConstruct %v2int [[val1]] [[val4]]
+// CHECK:   [[row2:%\d+]] = OpCompositeConstruct %v2int [[val2]] [[val5]]
+// CHECK: [[matrix:%\d+]] = OpCompositeConstruct %_arr_v2int_uint_3 [[row0]] [[row1]] [[row2]]
+// CHECK:                   OpStore %j [[matrix]]
+  int3x2 j = buf.Load<int3x2>(tid.x);
 
 // ********* 64-bit matrix ********************
 
@@ -85,7 +94,6 @@ void main(uint3 tid : SV_DispatchThreadId)
 // CHECK:          [[val1_ulong:%\d+]] = OpBitwiseOr %ulong [[word2_ulong]] [[word3_ulong_shifted]]
 // CHECK:                [[val1:%\d+]] = OpBitcast %double [[val1_ulong]]
 // CHECK:             [[index_4:%\d+]] = OpIAdd %uint [[index_3]] %uint_1
-// CHECK:                [[row0:%\d+]] = OpCompositeConstruct %v2double [[val0]] [[val1]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_4]]
 // CHECK:               [[word4:%\d+]] = OpLoad %uint [[ptr]]
 // CHECK:             [[index_5:%\d+]] = OpIAdd %uint [[index_4]] %uint_1
@@ -107,7 +115,8 @@ void main(uint3 tid : SV_DispatchThreadId)
 // CHECK: [[word7_ulong_shifted:%\d+]] = OpShiftLeftLogical %ulong [[word7_ulong]] %uint_32
 // CHECK:          [[val3_ulong:%\d+]] = OpBitwiseOr %ulong [[word6_ulong]] [[word7_ulong_shifted]]
 // CHECK:                [[val3:%\d+]] = OpBitcast %double [[val3_ulong]]
-// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v2double [[val2]] [[val3]]
+// CHECK:                [[row0:%\d+]] = OpCompositeConstruct %v2double [[val0]] [[val2]]
+// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v2double [[val1]] [[val3]]
 // CHECK:              [[matrix:%\d+]] = OpCompositeConstruct %mat2v2double [[row0]] [[row1]]
 // CHECK:                                OpStore %f64 [[matrix]]
   float64_t2x2 f64 = buf.Load<float64_t2x2>(tid.x);
@@ -118,35 +127,35 @@ void main(uint3 tid : SV_DispatchThreadId)
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_0]]
 // CHECK:             [[index_1:%\d+]] = OpIAdd %uint [[index_0]] %uint_1
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
-// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
 // CHECK:             [[index_2:%\d+]] = OpIAdd %uint [[index_1]] %uint_1
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2]]
 // CHECK:             [[index_3:%\d+]] = OpIAdd %uint [[index_2]] %uint_1
+// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:                [[row2:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:            [[matrix_1:%\d+]] = OpCompositeConstruct %mat2v3half [[row1]] [[row2]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
 // CHECK:             [[index_4:%\d+]] = OpIAdd %uint [[index_3]] %uint_1
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_4]]
-// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_4]]
 // CHECK:             [[index_5:%\d+]] = OpIAdd %uint [[index_4]] %uint_1
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_5]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_5]]
 // CHECK:             [[index_6:%\d+]] = OpIAdd %uint [[index_5]] %uint_1
+// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:                [[row2:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:            [[matrix_2:%\d+]] = OpCompositeConstruct %mat2v3half [[row1]] [[row2]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_6]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_6]]
 // CHECK:             [[index_7:%\d+]] = OpIAdd %uint [[index_6]] %uint_1
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_7]]
-// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_7]]
 // CHECK:             [[index_8:%\d+]] = OpIAdd %uint [[index_7]] %uint_1
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_8]]
 // CHECK:                 [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_8]]
+// CHECK:                [[row1:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:                [[row2:%\d+]] = OpCompositeConstruct %v3half
 // CHECK:            [[matrix_3:%\d+]] = OpCompositeConstruct %mat2v3half [[row1]] [[row2]]
 // CHECK:        [[matrix_array:%\d+]] = OpCompositeConstruct %_arr_mat2v3half_uint_3 [[matrix_1]] [[matrix_2]] [[matrix_3]]
diff --git a/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.matrix.hlsl b/tools/clang/test/CodeGenSPIRV/method.byte-address-buffer.templated-store.matrix.hlsl
@@ -0,0 +1,53 @@
+// RUN: %dxc -T cs_6_2 -E main
+//
+// In this test, check that matrix order is preserved on a templated store.
+
+ByteAddressBuffer buf;
+RWByteAddressBuffer buf2;
+
+[numthreads(64, 1, 1)]
+void main(uint3 tid : SV_DispatchThreadId)
+{
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_0:%\d+]]
+// CHECK:  [[word0:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:   [[val0:%\d+]] = OpBitcast %int [[word0]]
+// CHECK:[[index_1:%\d+]] = OpIAdd %uint [[index_0]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_1]]
+// CHECK:  [[word1:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:   [[val1:%\d+]] = OpBitcast %int [[word1]]
+// CHECK:[[index_2:%\d+]] = OpIAdd %uint [[index_1]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_2]]
+// CHECK:  [[word2:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:   [[val2:%\d+]] = OpBitcast %int [[word2]]
+// CHECK:[[index_3:%\d+]] = OpIAdd %uint [[index_2]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf %uint_0 [[index_3]]
+// CHECK:  [[word3:%\d+]] = OpLoad %uint [[ptr]]
+// CHECK:   [[val3:%\d+]] = OpBitcast %int [[word3]]
+// CHECK:   [[row0:%\d+]] = OpCompositeConstruct %v2int [[val0]] [[val2]]
+// CHECK:   [[row1:%\d+]] = OpCompositeConstruct %v2int [[val1]] [[val3]]
+// CHECK:   [[mat0:%\d+]] = OpCompositeConstruct %_arr_v2int_uint_2 [[row0]] [[row1]]
+// CHECK:                   OpStore [[temp:%\w+]] [[mat0]]
+// CHECK:   [[mat1:%\d+]] = OpLoad %_arr_v2int_uint_2 [[temp]]
+// CHECK:  [[elem0:%\d+]] = OpCompositeExtract %int [[mat1]] 0 0
+// CHECK:  [[elem1:%\d+]] = OpCompositeExtract %int [[mat1]] 1 0
+// CHECK:  [[elem2:%\d+]] = OpCompositeExtract %int [[mat1]] 0 1
+// CHECK:  [[elem3:%\d+]] = OpCompositeExtract %int [[mat1]] 1 1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[idx0:%\d+]]
+// CHECK:    [[val:%\d+]] = OpBitcast %uint [[elem0]]
+// CHECK:                   OpStore [[ptr]] [[val]]
+// CHECK:   [[idx1:%\d+]] = OpIAdd %uint [[idx0]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[idx1]]
+// CHECK:    [[val:%\d+]] = OpBitcast %uint [[elem1]]
+// CHECK:                   OpStore [[ptr]] [[val]]
+// CHECK:   [[idx2:%\d+]] = OpIAdd %uint [[idx1]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[idx2]]
+// CHECK:    [[val:%\d+]] = OpBitcast %uint [[elem2]]
+// CHECK:                   OpStore [[ptr]] [[val]]
+// CHECK:   [[idx3:%\d+]] = OpIAdd %uint [[idx2]] %uint_1
+// CHECK:    [[ptr:%\d+]] = OpAccessChain %_ptr_Uniform_uint %buf2 %uint_0 [[idx3]]
+// CHECK:    [[val:%\d+]] = OpBitcast %uint [[elem3]]
+// CHECK:                   OpStore [[ptr]] [[val]]
+  // Round-trip a 2x2 matrix: load it from buf, then store it unchanged to buf2.
+  int2x2 i = buf.Load<int2x2>(tid.x);
+  buf2.Store<int2x2>(tid.x, i);
+}
diff --git a/tools/clang/test/CodeGenSPIRV/spv.use-legacy-buffer-matrix-order.hlsl b/tools/clang/test/CodeGenSPIRV/spv.use-legacy-buffer-matrix-order.hlsl
diff --git a/tools/clang/unittests/SPIRV/CodeGenSpirvTest.cpp b/tools/clang/unittests/SPIRV/CodeGenSpirvTest.cpp