From e520d0a5d052788c42ed9a5b8567dd9e52fb779a Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 19 Feb 2026 15:37:50 -0700
Subject: [PATCH 1/7] [SM6.10] Implement groupshared Builtins

Implements the groupshared-memory LinAlg builtins (MatrixLoadFromMemory,
MatrixStoreToMemory, and MatrixAccumulateToMemory), following the pattern
established by the previously implemented builtins.
---
 include/dxc/DXIL/DxilInstructions.h           | 18 +++---
 include/dxc/DXIL/DxilOperations.h             |  1 +
 lib/DXIL/DxilOperations.cpp                   | 46 ++++++++++-----
 lib/HLSL/HLOperationLower.cpp                 | 56 ++++++++++++++++++-
 .../matrixaccumulatetomemory/nominal.hlsl     | 19 +++++++
 .../matrixloadfrommemory/nominal.hlsl         | 19 +++++++
 .../builtins/matrixstoretomemory/nominal.hlsl | 19 +++++++
 .../matrixaccumulatetomemory/ast.hlsl         | 24 ++++++++
 .../unavailable_pre_sm610.hlsl                | 15 +++++
 .../builtins/matrixloadfrommemory/ast.hlsl    | 24 ++++++++
 .../unavailable_pre_sm610.hlsl                | 15 +++++
 .../builtins/matrixstoretomemory/ast.hlsl     | 24 ++++++++
 .../unavailable_pre_sm610.hlsl                | 15 +++++
 .../hlsl/linalg/builtins/stage-errors.hlsl    | 16 ++++++
 utils/hct/gen_intrin_main.txt                 |  6 +-
 utils/hct/hctdb.py                            | 20 +++----
 16 files changed, 297 insertions(+), 40 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl
 create mode 100644 tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl

diff --git a/include/dxc/DXIL/DxilInstructions.h b/include/dxc/DXIL/DxilInstructions.h
index 8c48202ce0..941eab6474 100644
--- a/include/dxc/DXIL/DxilInstructions.h
+++ b/include/dxc/DXIL/DxilInstructions.h
@@ -10651,14 +10651,14 @@ struct DxilInst_LinAlgMatrixLoadFromMemory {
   bool requiresUniformInputs() const { return false; }
   // Operand indexes
   enum OperandIdx {
-    arg_groupsharedArr = 1,
+    arg_memory = 1,
     arg_offset = 2,
     arg_stride = 3,
     arg_layout = 4,
   };
   // Accessors
-  llvm::Value *get_groupsharedArr() const { return Instr->getOperand(1); }
-  void set_groupsharedArr(llvm::Value *val) { Instr->setOperand(1, val); }
+  llvm::Value *get_memory() const { return Instr->getOperand(1); }
+  void set_memory(llvm::Value *val) { Instr->setOperand(1, val); }
   llvm::Value *get_offset() const { return Instr->getOperand(2); }
   void set_offset(llvm::Value *val) { Instr->setOperand(2, val); }
   llvm::Value *get_stride() const { return Instr->getOperand(3); }
@@ -10854,7 +10854,7 @@ struct DxilInst_LinAlgMatrixStoreToMemory {
   // Operand indexes
   enum OperandIdx {
     arg_matrix = 1,
-    arg_groupsharedArr = 2,
+    arg_memory = 2,
     arg_offset = 3,
     arg_stride = 4,
     arg_layout = 5,
@@ -10862,8 +10862,8 @@ struct DxilInst_LinAlgMatrixStoreToMemory {
   // Accessors
   llvm::Value *get_matrix() const { return Instr->getOperand(1); }
   void set_matrix(llvm::Value *val) { Instr->setOperand(1, val); }
-  llvm::Value *get_groupsharedArr() const { return Instr->getOperand(2); }
-  void set_groupsharedArr(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_memory() const { return Instr->getOperand(2); }
+  void set_memory(llvm::Value *val) { Instr->setOperand(2, val); }
   llvm::Value *get_offset() const { return Instr->getOperand(3); }
   void set_offset(llvm::Value *val) { Instr->setOperand(3, val); }
   llvm::Value *get_stride() const { return Instr->getOperand(4); }
@@ -11091,7 +11091,7 @@ struct DxilInst_LinAlgMatrixAccumulateToMemory {
   // Operand indexes
   enum OperandIdx {
     arg_matrix = 1,
-    arg_groupsharedArr = 2,
+    arg_memory = 2,
     arg_offset = 3,
     arg_stride = 4,
     arg_layout = 5,
@@ -11099,8 +11099,8 @@ struct DxilInst_LinAlgMatrixAccumulateToMemory {
   // Accessors
   llvm::Value *get_matrix() const { return Instr->getOperand(1); }
   void set_matrix(llvm::Value *val) { Instr->setOperand(1, val); }
-  llvm::Value *get_groupsharedArr() const { return Instr->getOperand(2); }
-  void set_groupsharedArr(llvm::Value *val) { Instr->setOperand(2, val); }
+  llvm::Value *get_memory() const { return Instr->getOperand(2); }
+  void set_memory(llvm::Value *val) { Instr->setOperand(2, val); }
   llvm::Value *get_offset() const { return Instr->getOperand(3); }
   void set_offset(llvm::Value *val) { Instr->setOperand(3, val); }
   llvm::Value *get_stride() const { return Instr->getOperand(4); }
diff --git a/include/dxc/DXIL/DxilOperations.h b/include/dxc/DXIL/DxilOperations.h
index bab4bffc6e..85df375b3a 100644
--- a/include/dxc/DXIL/DxilOperations.h
+++ b/include/dxc/DXIL/DxilOperations.h
@@ -212,6 +212,7 @@ class OP {
     TS_UDT = 8,      // Ex: %"struct.MyStruct" *
     TS_Object = 9,   // Ex: %"class.StructuredBuffer<Foo>"
     TS_Vector = 10,  // Ex: <8 x i16>
+    TS_Array = 11,   // Ex: [8 x float]
     TS_MaskBitCount, // Types used in Mask end here
     // TS_Extended is only used to identify the unnamed struct type used to wrap
     // multiple overloads when using GetTypeSlot.
diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
index 4138b3d930..02dcfe65a0 100644
--- a/lib/DXIL/DxilOperations.cpp
+++ b/lib/DXIL/DxilOperations.cpp
@@ -2863,8 +2863,8 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      "linAlgMatrixLoadFromMemory",
      Attribute::None,
      2,
-     {{0x200}, {0x63}},
-     {{0x0}, {0x0}}}, // Overloads: o,hfwi
+     {{0x200}, {0x800}},
+     {{0x0}, {0x0}}}, // Overloads: o,a
     {OC::LinAlgMatrixLength,
      "LinAlgMatrixLength",
      OCC::LinAlgMatrixLength,
@@ -2911,8 +2911,8 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      "linAlgMatrixStoreToMemory",
      Attribute::None,
      2,
-     {{0x200}, {0x63}},
-     {{0x0}, {0x0}}}, // Overloads: o,hfwi
+     {{0x200}, {0x800}},
+     {{0x0}, {0x0}}}, // Overloads: o,a
     {OC::LinAlgMatrixQueryAccumulatorLayout,
      "LinAlgMatrixQueryAccumulatorLayout",
      OCC::LinAlgMatrixQueryAccumulatorLayout,
@@ -2967,8 +2967,8 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      "linAlgMatrixAccumulateToMemory",
      Attribute::None,
      2,
-     {{0x200}, {0x63}},
-     {{0x0}, {0x0}}}, // Overloads: o,hfwi
+     {{0x200}, {0x800}},
+     {{0x0}, {0x0}}}, // Overloads: o,a
     {OC::LinAlgMatrixOuterProduct,
      "LinAlgMatrixOuterProduct",
      OCC::LinAlgMatrixOuterProduct,
@@ -3152,6 +3152,8 @@ unsigned OP::GetTypeSlot(Type *pType) {
       return TS_Extended;
   case Type::VectorTyID:
     return TS_Vector;
+  case Type::ArrayTyID:
+    return TS_Array;
   default:
     break;
   }
@@ -3166,26 +3168,39 @@ const char *OP::GetOverloadTypeName(unsigned TypeSlot) {
 StringRef OP::GetTypeName(Type *Ty, SmallVectorImpl<char> &Storage) {
   DXASSERT(!Ty->isVoidTy(), "must not pass void type here");
   unsigned TypeSlot = OP::GetTypeSlot(Ty);
+
   if (TypeSlot < TS_BasicCount) {
     return GetOverloadTypeName(TypeSlot);
-  } else if (TypeSlot == TS_UDT) {
+  }
+
+  switch (TypeSlot) {
+  case TS_UDT: {
     if (Ty->isPointerTy())
       Ty = Ty->getPointerElementType();
     StructType *ST = cast<StructType>(Ty);
     return ST->getStructName();
-  } else if (TypeSlot == TS_Object) {
+  }
+  case TS_Object: {
     StructType *ST = cast<StructType>(Ty);
     if (dxilutil::IsHLSLLinAlgMatrixType(Ty))
       return (Twine("m") + Twine(dxilutil::GetHLSLLinAlgMatrixTypeMangling(ST)))
           .toStringRef(Storage);
     return ST->getStructName();
-  } else if (TypeSlot == TS_Vector) {
+  }
+  case TS_Vector: {
     VectorType *VecTy = cast<VectorType>(Ty);
     return (Twine("v") + Twine(VecTy->getNumElements()) +
             Twine(
                 GetOverloadTypeName(OP::GetTypeSlot(VecTy->getElementType()))))
         .toStringRef(Storage);
-  } else if (TypeSlot == TS_Extended) {
+  }
+  case TS_Array: {
+    if (Ty->isPointerTy())
+      Ty = Ty->getPointerElementType();
+    ArrayType *ArrTy = cast<ArrayType>(Ty);
+    return GetOverloadTypeName(OP::GetTypeSlot(ArrTy->getArrayElementType()));
+  }
+  case TS_Extended: {
     DXASSERT(isa<StructType>(Ty),
              "otherwise, extended overload type not wrapped in struct type.");
     StructType *ST = cast<StructType>(Ty);
@@ -3200,11 +3215,14 @@ StringRef OP::GetTypeName(Type *Ty, SmallVectorImpl<char> &Storage) {
       OS << GetTypeName(ST->getElementType(I), TempStr);
     }
     return OS.str();
-  } else {
-    raw_svector_ostream OS(Storage);
-    Ty->print(OS);
-    return OS.str();
   }
+  default:
+    break;
+  }
+
+  raw_svector_ostream OS(Storage);
+  Ty->print(OS);
+  return OS.str();
 }
 
 StringRef OP::ConstructOverloadName(Type *Ty, DXIL::OpCode opCode,
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 4f22a4598d..9ea6166f36 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -7226,6 +7226,53 @@ Value *TranslateLinAlgCopyConvertMatrix(CallInst *CI, IntrinsicOp IOP,
   return nullptr;
 }
 
+Value *TranslateLinAlgMatrixLoadFromMemory(
+    CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
+    HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *ObjHelper,
+    bool &Translated) {
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+  // Lower the HL LoadFromMemory call (CI) to a call of the DXIL op OpCode.
+  Value *MatrixPtr = CI->getArgOperand(1); // 'out' matrix, by pointer
+  DXASSERT_NOMSG(isa<PointerType>(MatrixPtr->getType()));
+  Type *MatrixType = MatrixPtr->getType()->getPointerElementType();
+  // Operands 2-5: groupshared array, then offset, stride and layout.
+  Value *Arr = CI->getArgOperand(2);
+  Value *Offset = CI->getArgOperand(3);
+  Value *Stride = CI->getArgOperand(4);
+  Value *Layout = CI->getArgOperand(5);
+  // The overload is keyed on the matrix type and the array operand's type.
+  Constant *OpArg = HlslOp->GetU32Const((unsigned)OpCode);
+  Function *DxilFunc = HlslOp->GetOpFunc(OpCode, {MatrixType, Arr->getType()});
+  // Emit the op and write its result back through the 'out' pointer.
+  Value *Matrix =
+      Builder.CreateCall(DxilFunc, {OpArg, Arr, Offset, Stride, Layout});
+  Builder.CreateStore(Matrix, MatrixPtr);
+
+  return nullptr; // result was already stored via MatrixPtr
+}
+
+Value *TranslateLinAlgMatrixAccumStoreToMemory(
+    CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode,
+    HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *ObjHelper,
+    bool &Translated) {
+  hlsl::OP *HlslOp = &Helper.hlslOP;
+  IRBuilder<> Builder(CI);
+  // Shared by the StoreToMemory and AccumulateToMemory builtins via OpCode.
+  Value *Matrix = CI->getArgOperand(1); // matrix operand, used as-is
+  Value *Arr = CI->getArgOperand(2);    // groupshared array operand
+  Value *Offset = CI->getArgOperand(3);
+  Value *Stride = CI->getArgOperand(4);
+  Value *Layout = CI->getArgOperand(5);
+  // The overload is keyed on the matrix type and the array operand's type.
+  Constant *OpArg = HlslOp->GetU32Const((unsigned)OpCode);
+  Function *DxilFunc =
+      HlslOp->GetOpFunc(OpCode, {Matrix->getType(), Arr->getType()});
+  // Operands are forwarded unchanged, after the opcode constant.
+  return Builder.CreateCall(DxilFunc,
+                            {OpArg, Matrix, Arr, Offset, Stride, Layout});
+}
+
 } // namespace
 
 // Lower table.
@@ -7989,14 +8036,16 @@ constexpr IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP___builtin_LinAlg_MatrixLoadFromDescriptor,
      TranslateLinAlgMatrixLoadFromDescriptor,
      DXIL::OpCode::LinAlgMatrixLoadFromDescriptor},
-    {IntrinsicOp::IOP___builtin_LinAlg_MatrixLoadFromMemory, EmptyLower,
+    {IntrinsicOp::IOP___builtin_LinAlg_MatrixLoadFromMemory,
+     TranslateLinAlgMatrixLoadFromMemory,
      DXIL::OpCode::LinAlgMatrixLoadFromMemory},
     {IntrinsicOp::IOP___builtin_LinAlg_MatrixSetElement,
      TranslateLinAlgMatrixSetElement, DXIL::OpCode::LinAlgMatrixSetElement},
     {IntrinsicOp::IOP___builtin_LinAlg_MatrixStoreToDescriptor,
      TranslateLinAlgMatrixAccumStoreToDescriptor,
      DXIL::OpCode::LinAlgMatrixStoreToDescriptor},
-    {IntrinsicOp::IOP___builtin_LinAlg_MatrixStoreToMemory, EmptyLower,
+    {IntrinsicOp::IOP___builtin_LinAlg_MatrixStoreToMemory,
+     TranslateLinAlgMatrixAccumStoreToMemory,
      DXIL::OpCode::LinAlgMatrixStoreToMemory},
     {IntrinsicOp::IOP___builtin_LinAlg_MatrixAccumulate,
      TranslateLinAlgMatrixAccumulate, DXIL::OpCode::LinAlgMatrixAccumulate},
@@ -8010,7 +8059,8 @@ constexpr IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP___builtin_LinAlg_MatrixAccumulateToDescriptor,
      TranslateLinAlgMatrixAccumStoreToDescriptor,
      DXIL::OpCode::LinAlgMatrixAccumulateToDescriptor},
-    {IntrinsicOp::IOP___builtin_LinAlg_MatrixAccumulateToMemory, EmptyLower,
+    {IntrinsicOp::IOP___builtin_LinAlg_MatrixAccumulateToMemory,
+     TranslateLinAlgMatrixAccumStoreToMemory,
      DXIL::OpCode::LinAlgMatrixAccumulateToMemory},
     {IntrinsicOp::IOP___builtin_LinAlg_MatrixOuterProduct,
      TranslateLinAlgMatrixOuterProduct, DXIL::OpCode::LinAlgMatrixOuterProduct},
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
new file mode 100644
index 0000000000..5461600016
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
@@ -0,0 +1,19 @@
+// REQUIRES: dxil-1-10
+// RUN: %dxc -T cs_6_10 -HV 202x -E main %s | FileCheck %s
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64]) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixAccumulateToMemory(mat, Arr, 0, 0, 0);
+}
+
+// CHECK: @{{.*}} = external addrspace(3) global [64 x float]
+
+[numthreads(4,1,1)]
+void main() {
+  // CHECK-LABEL: define void @main()
+
+  // CHECK: call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U1S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  fn(SharedArr);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
new file mode 100644
index 0000000000..a5dd722f1b
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
@@ -0,0 +1,19 @@
+// REQUIRES: dxil-1-10
+// RUN: %dxc -T cs_6_10 -HV 202x -E main %s | FileCheck %s
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64]) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixLoadFromMemory(mat, Arr, 0, 0, 0);
+}
+
+// CHECK: @{{.*}} = external addrspace(3) global [64 x float]
+
+[numthreads(4,1,1)]
+void main() {
+  // CHECK-LABEL: define void @main()
+
+  // CHECK: call %dx.types.LinAlgMatrixC4M5N4U1S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U1S2.f32(i32 -2147483633, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  fn(SharedArr);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
new file mode 100644
index 0000000000..f6c38536a3
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
@@ -0,0 +1,19 @@
+// REQUIRES: dxil-1-10
+// RUN: %dxc -T cs_6_10 -HV 202x -E main %s | FileCheck %s
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64]) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixStoreToMemory(mat, Arr, 0, 0, 0);
+}
+
+// CHECK: @{{.*}} = external addrspace(3) global [64 x float]
+
+[numthreads(4,1,1)]
+void main() {
+  // CHECK-LABEL: define void @main()
+
+  // CHECK: call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U1S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
+  fn(SharedArr);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl
new file mode 100644
index 0000000000..e3694e1eb4
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl
@@ -0,0 +1,24 @@
+// REQUIRES: dxil-1-10
+// RUN: %dxc -T lib_6_10 -E main %s -ast-dump-implicit | FileCheck %s
+
+// CHECK: FunctionDecl {{.*}} implicit used __builtin_LinAlg_MatrixAccumulateToMemory 'void (__builtin_LinAlgMatrix {{.*}}, float const __attribute__((address_space(3))) (&)[64], unsigned int, unsigned int, unsigned int)' extern
+// CHECK-NEXT: ParmVarDecl {{.*}} matrix '__builtin_LinAlgMatrix {{.*}}'
+// CHECK-NEXT: ParmVarDecl {{.*}} memory 'float const __attribute__((address_space(3))) (&)[64]'
+// CHECK-NEXT: ParmVarDecl {{.*}} offset 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} stride 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} layout 'unsigned int'
+// CHECK-NEXT: HLSLIntrinsicAttr {{.*}} Implicit "op" "" 420
+// CHECK-NEXT: AvailabilityAttr {{.*}} Implicit  6.10 0 0 ""
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64]) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixAccumulateToMemory(mat, Arr, 0, 0, 0);
+}
+
+[shader("compute")]
+[numthreads(1,1,1)]
+void main() {
+  fn(SharedArr);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl
new file mode 100644
index 0000000000..8048e22922
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -T cs_6_9 -HV 202x -E main %s -verify
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64], float F) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+
+  // expected-error@+1{{intrinsic __builtin_LinAlg_MatrixAccumulateToMemory potentially used by ''main'' requires shader model 6.10 or greater}}
+  __builtin_LinAlg_MatrixAccumulateToMemory(mat, Arr, 0, 0, 0);
+}
+
+[numthreads(4,1,1)]
+void main() {
+  fn(SharedArr, 6.0);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl
new file mode 100644
index 0000000000..2874ba3c37
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl
@@ -0,0 +1,24 @@
+// REQUIRES: dxil-1-10
+// RUN: %dxc -T lib_6_10 -E main %s -ast-dump-implicit | FileCheck %s
+
+// CHECK: FunctionDecl {{.*}} implicit used __builtin_LinAlg_MatrixLoadFromMemory 'void (__builtin_LinAlgMatrix {{.*}}, float const __attribute__((address_space(3))) (&)[64], unsigned int, unsigned int, unsigned int)' extern
+// CHECK-NEXT: ParmVarDecl {{.*}} ret '__builtin_LinAlgMatrix {{.*}}'
+// CHECK-NEXT: ParmVarDecl {{.*}} memory 'float const __attribute__((address_space(3))) (&)[64]'
+// CHECK-NEXT: ParmVarDecl {{.*}} offset 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} stride 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} layout 'unsigned int'
+// CHECK-NEXT: HLSLIntrinsicAttr {{.*}} Implicit "op" "" 411
+// CHECK-NEXT: AvailabilityAttr {{.*}} Implicit  6.10 0 0 ""
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64]) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixLoadFromMemory(mat, Arr, 0, 0, 0);
+}
+
+[shader("compute")]
+[numthreads(1,1,1)]
+void main() {
+  fn(SharedArr);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl
new file mode 100644
index 0000000000..af3dd3b846
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -T cs_6_9 -HV 202x -E main %s -verify
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64], float F) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+
+  // expected-error@+1{{intrinsic __builtin_LinAlg_MatrixLoadFromMemory potentially used by ''main'' requires shader model 6.10 or greater}}
+  __builtin_LinAlg_MatrixLoadFromMemory(mat, Arr, 0, 0, 0);
+}
+
+[numthreads(4,1,1)]
+void main() {
+  fn(SharedArr, 6.0);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl
new file mode 100644
index 0000000000..1c2520fe6c
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl
@@ -0,0 +1,24 @@
+// REQUIRES: dxil-1-10
+// RUN: %dxc -T lib_6_10 -E main %s -ast-dump-implicit | FileCheck %s
+
+// CHECK: FunctionDecl {{.*}} implicit used __builtin_LinAlg_MatrixStoreToMemory 'void (__builtin_LinAlgMatrix {{.*}}, float const __attribute__((address_space(3))) (&)[64], unsigned int, unsigned int, unsigned int)' extern
+// CHECK-NEXT: ParmVarDecl {{.*}} matrix '__builtin_LinAlgMatrix {{.*}}'
+// CHECK-NEXT: ParmVarDecl {{.*}} memory 'float const __attribute__((address_space(3))) (&)[64]'
+// CHECK-NEXT: ParmVarDecl {{.*}} offset 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} stride 'unsigned int'
+// CHECK-NEXT: ParmVarDecl {{.*}} layout 'unsigned int'
+// CHECK-NEXT: HLSLIntrinsicAttr {{.*}} Implicit "op" "" 414
+// CHECK-NEXT: AvailabilityAttr {{.*}} Implicit  6.10 0 0 ""
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64]) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixStoreToMemory(mat, Arr, 0, 0, 0);
+}
+
+[shader("compute")]
+[numthreads(1,1,1)]
+void main() {
+  fn(SharedArr);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl
new file mode 100644
index 0000000000..934963f5dc
--- /dev/null
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl
@@ -0,0 +1,15 @@
+// RUN: %dxc -T cs_6_9 -HV 202x -E main %s -verify
+
+groupshared float SharedArr[64];
+
+void fn(groupshared float Arr[64], float F) {
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+
+  // expected-error@+1{{intrinsic __builtin_LinAlg_MatrixStoreToMemory potentially used by ''main'' requires shader model 6.10 or greater}}
+  __builtin_LinAlg_MatrixStoreToMemory(mat, Arr, 0, 0, 0);
+}
+
+[numthreads(4,1,1)]
+void main() {
+  fn(SharedArr, 6.0);
+}
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/stage-errors.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/stage-errors.hlsl
index fbec113e81..c9ebd7adf8 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/stage-errors.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/stage-errors.hlsl
@@ -8,8 +8,12 @@
 // RUN: %dxc -T lib_6_10 -DMATRIX_STORE_TO_DESCRIPTOR %s -verify
 // RUN: %dxc -T lib_6_10 -DMATRIX_LENGTH %s -verify
 // RUN: %dxc -T lib_6_10 -DMATRIX_ACCUMULATE %s -verify
+// RUN: %dxc -T lib_6_10 -DMATRIX_LOAD_FROM_MEMORY %s -verify
+// RUN: %dxc -T lib_6_10 -DMATRIX_STORE_TO_MEMORY %s -verify
+// RUN: %dxc -T lib_6_10 -DMATRIX_ACCUMULATE_TO_MEMORY %s -verify
 
 RWByteAddressBuffer buf;
+groupshared float gs_arr[64];
 
 void CallFunction()
 {
@@ -62,6 +66,18 @@ void CallFunction()
   #define DO_FUNC __builtin_LinAlg_MatrixAccumulate(mat1, mat2, mat3);
 #endif
 
+#ifdef MATRIX_LOAD_FROM_MEMORY
+  #define DO_FUNC __builtin_LinAlg_MatrixLoadFromMemory(mat1, gs_arr, 0, 0, 0);
+#endif
+
+#ifdef MATRIX_STORE_TO_MEMORY
+  #define DO_FUNC __builtin_LinAlg_MatrixStoreToMemory(mat1, gs_arr, 0, 0, 0);
+#endif
+
+#ifdef MATRIX_ACCUMULATE_TO_MEMORY
+  #define DO_FUNC __builtin_LinAlg_MatrixAccumulateToMemory(mat1, gs_arr, 0, 0, 0);
+#endif
+
   // The builtins below are allowed in all stages, if they raise an error
   // then the test will fail with "saw unexpected diagnostic"
   uint layout = __builtin_LinAlg_MatrixQueryAccumulatorLayout();
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index 49aa2f151b..4810442a7e 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -402,13 +402,13 @@ void [[min_sm=6.10]] __builtin_LinAlg_FillMatrix(out LinAlgMatrix ret, in numeri
 void [[min_sm=6.10]] __builtin_LinAlg_CopyConvertMatrix(out LinAlgMatrix ret, in LinAlgMatrix source, in bool transpose);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixLoadFromDescriptor(out LinAlgMatrix ret, in ByteAddressBuffer buf, in uint offset, in uint stride, in uint layout);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixLoadFromDescriptor(out LinAlgMatrix ret, in RWByteAddressBuffer buf, in uint offset, in uint stride, in uint layout);
-void [[min_sm=6.10]] __builtin_LinAlg_MatrixLoadFromMemory(out LinAlgMatrix ret, in int GroupSharedMem, in uint offset, in uint stride, in uint layout);
+void [[min_sm=6.10]] __builtin_LinAlg_MatrixLoadFromMemory(out LinAlgMatrix ret, groupshared numeric[] memory, in uint offset, in uint stride, in uint layout);
 uint [[min_sm=6.10]] __builtin_LinAlg_MatrixLength(in LinAlgMatrix matrix);
 uint<2> [[min_sm=6.10]] __builtin_LinAlg_MatrixGetCoordinate(in LinAlgMatrix matrix, in uint threadLocalIndex);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixGetElement(out numeric ret, in LinAlgMatrix matrix, in uint threadLocalIndex);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixSetElement(out LinAlgMatrix ret, in LinAlgMatrix matrix, in uint threadLocalIndex, in numeric value);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixStoreToDescriptor(in LinAlgMatrix matrix, in RWByteAddressBuffer buf, in uint offset, in uint stride, in uint layout);
-void [[min_sm=6.10]] __builtin_LinAlg_MatrixStoreToMemory(in LinAlgMatrix matrix, in int GroupSharedMem, in uint offset, in uint stride, in uint layout);
+void [[min_sm=6.10]] __builtin_LinAlg_MatrixStoreToMemory(in LinAlgMatrix matrix, groupshared numeric[] memory, in uint offset, in uint stride, in uint layout);
 uint [[min_sm=6.10]] __builtin_LinAlg_MatrixQueryAccumulatorLayout();
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixMatrixMultiply(out LinAlgMatrix matrixC, in LinAlgMatrix matrixA, in LinAlgMatrix matrixB);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixMatrixMultiplyAccumulate(out LinAlgMatrix matrixR, in LinAlgMatrix matrixA, in LinAlgMatrix matrixB, in LinAlgMatrix matrixC);
@@ -416,7 +416,7 @@ void [[min_sm=6.10]] __builtin_LinAlg_MatrixAccumulate(out LinAlgMatrix matrixC,
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixVectorMultiply(out numeric<> ret, in LinAlgMatrix mat, in numeric<> input, in uint input_interp);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixVectorMultiplyAdd(out numeric<> ret, in LinAlgMatrix mat, in numeric<> input, in uint input_interp, in numeric<> bias, in uint bias_interp);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixAccumulateToDescriptor(in LinAlgMatrix matrix, in RWByteAddressBuffer buf, in uint offset, in uint stride, in uint layout);
-void [[min_sm=6.10]] __builtin_LinAlg_MatrixAccumulateToMemory(in LinAlgMatrix matrix, in int GroupSharedMem, in uint offset, in uint stride, in uint layout);
+void [[min_sm=6.10]] __builtin_LinAlg_MatrixAccumulateToMemory(in LinAlgMatrix matrix, groupshared numeric[] memory, in uint offset, in uint stride, in uint layout);
 void [[min_sm=6.10]] __builtin_LinAlg_MatrixOuterProduct(out LinAlgMatrix ret, in numeric<> vecA, in numeric<> vecB);
 
 } namespace
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 71f035e059..5dbb59102f 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -52,9 +52,10 @@
 # - "," is used to separate multiple overload dimensions.
 #   - When used, only $x0, $x1, etc. are supported for overloaded parameter
 #     types.
+# - "a" is for any array ([n x Ty])
 # dxil_all_user_oload_chars must be kept in sync with the indices in
 # hlsl::OP::TypeSlot in DxilOperations.h.
-dxil_all_user_oload_chars = "hfd18wiluo<"
+dxil_all_user_oload_chars = "hfd18wiluo<a"
 dxil_scalar_oload_chars = "hfd18wil"
 
 # Maximum number of overload dimensions supported through the extended overload
@@ -6406,13 +6407,12 @@ def populate_ExperimentalOps(self):
             "LinAlgMatrixLoadFromMemory",
             "LinAlgMatrixLoadFromMemory",
             "fills a matrix with data from a groupshared array",
-            "o,hfwi",  # TODO: needs to be updated for groupshared
+            "o,a",
             "",
             [
                 db_dxil_param(0, "$x0", "", "resulting matrix"),
-                # TODO: [Ty] * addrspace(4),   ; groupshared T[M * N]
                 db_dxil_param(
-                    2, "$x1", "groupsharedArr", "groupshared array to fill matrix with"
+                    2, "$x1", "memory", "groupshared array to fill matrix with"
                 ),
                 db_dxil_param(3, "i32", "offset", "starting offset in the array"),
                 db_dxil_param(
@@ -6508,14 +6508,13 @@ def populate_ExperimentalOps(self):
             "LinAlgMatrixStoreToMemory",
             "LinAlgMatrixStoreToMemory",
             "stores a matrix to groupshared memory",
-            "o,hfwi",  # TODO: needs to be updated for groupshared
+            "o,a",
             "",
             [
                 db_dxil_param(0, "v", "", ""),
                 db_dxil_param(2, "$x0", "matrix", "matrix to be stored"),
-                # TODO: [Ty] * addrspace(4),   ; groupshared T[M * N]
                 db_dxil_param(
-                    3, "$x1", "groupsharedArr", "groupshared array to store into"
+                    3, "$x1", "memory", "groupshared array to store into"
                 ),
                 db_dxil_param(4, "i32", "offset", "starting offset in the array"),
                 db_dxil_param(
@@ -6626,14 +6625,13 @@ def populate_ExperimentalOps(self):
             "LinAlgMatrixAccumulateToMemory",
             "LinAlgMatrixAccumulateToMemory",
             "accumulates a matrix to groupshared memory",
-            "o,hfwi",  # TODO: needs to be updated for groupshared
+            "o,a",
             "",
             [
                 db_dxil_param(0, "v", "", ""),
                 db_dxil_param(2, "$x0", "matrix", "Accumulator matrix"),
-                # TODO: [Ty] * addrspace(4),   ; groupshared T[M * N]
                 db_dxil_param(
-                    3, "$x1", "groupsharedArr", "groupshared array to accumulate into"
+                    3, "$x1", "memory", "groupshared array to accumulate into"
                 ),
                 db_dxil_param(4, "i32", "offset", "starting offset in the array"),
                 db_dxil_param(
@@ -9599,7 +9597,7 @@ def __init__(self, intrinsic_defs, opcode_data):
             "out": "AR_QUAL_OUT",
             "col_major": "AR_QUAL_COLMAJOR",
             "row_major": "AR_QUAL_ROWMAJOR",
-            "groupshared": "AR_QUAL_GROUPSHARED",
+            "groupshared": "AR_QUAL_IN | AR_QUAL_GROUPSHARED",
         }
         self.intrinsics = []
         self.load_intrinsics(intrinsic_defs)

From 28f9487e9d27fc85b926f520e2189c3d03dc529d Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Tue, 10 Mar 2026 15:32:04 -0600
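Subject: [PATCH 2/7] Address comments

Simplify the groupshared linalg builtin tests: drop the
`fn(groupshared float Arr[64])` helper and call the builtins directly
from `main` on the `SharedArr` global. The CodeGenDXIL tests now pass
offset/stride/layout as 1, 2, 3 instead of all zeros, and the CHECK
lines match those values.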

---
 .../builtins/matrixaccumulatetomemory/nominal.hlsl  | 13 ++++---------
 .../builtins/matrixloadfrommemory/nominal.hlsl      | 13 ++++---------
 .../builtins/matrixstoretomemory/nominal.hlsl       | 13 ++++---------
 .../builtins/matrixaccumulatetomemory/ast.hlsl      |  8 ++------
 .../unavailable_pre_sm610.hlsl                      | 10 +++-------
 .../linalg/builtins/matrixloadfrommemory/ast.hlsl   |  8 ++------
 .../matrixloadfrommemory/unavailable_pre_sm610.hlsl | 10 +++-------
 .../linalg/builtins/matrixstoretomemory/ast.hlsl    |  8 ++------
 .../matrixstoretomemory/unavailable_pre_sm610.hlsl  | 10 +++-------
 9 files changed, 27 insertions(+), 66 deletions(-)

diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
index 5461600016..cfdac39028 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
@@ -1,19 +1,14 @@
 // REQUIRES: dxil-1-10
 // RUN: %dxc -T cs_6_10 -HV 202x -E main %s | FileCheck %s
 
-groupshared float SharedArr[64];
-
-void fn(groupshared float Arr[64]) {
-  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
-  __builtin_LinAlg_MatrixAccumulateToMemory(mat, Arr, 0, 0, 0);
-}
-
 // CHECK: @{{.*}} = external addrspace(3) global [64 x float]
+groupshared float SharedArr[64];
 
 [numthreads(4,1,1)]
 void main() {
   // CHECK-LABEL: define void @main()
 
-  // CHECK: call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U1S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
-  fn(SharedArr);
+  // CHECK: call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U1S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixAccumulateToMemory(mat, SharedArr, 1, 2, 3);
 }
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
index a5dd722f1b..a3e383ca58 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
@@ -1,19 +1,14 @@
 // REQUIRES: dxil-1-10
 // RUN: %dxc -T cs_6_10 -HV 202x -E main %s | FileCheck %s
 
-groupshared float SharedArr[64];
-
-void fn(groupshared float Arr[64]) {
-  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
-  __builtin_LinAlg_MatrixLoadFromMemory(mat, Arr, 0, 0, 0);
-}
-
 // CHECK: @{{.*}} = external addrspace(3) global [64 x float]
+groupshared float SharedArr[64];
 
 [numthreads(4,1,1)]
 void main() {
   // CHECK-LABEL: define void @main()
 
-  // CHECK: call %dx.types.LinAlgMatrixC4M5N4U1S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U1S2.f32(i32 -2147483633, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
-  fn(SharedArr);
+  // CHECK: call %dx.types.LinAlgMatrixC4M5N4U1S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U1S2.f32(i32 -2147483633, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixLoadFromMemory(mat, SharedArr, 1, 2, 3);
 }
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
index f6c38536a3..4b5b50c357 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
@@ -1,19 +1,14 @@
 // REQUIRES: dxil-1-10
 // RUN: %dxc -T cs_6_10 -HV 202x -E main %s | FileCheck %s
 
-groupshared float SharedArr[64];
-
-void fn(groupshared float Arr[64]) {
-  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
-  __builtin_LinAlg_MatrixStoreToMemory(mat, Arr, 0, 0, 0);
-}
-
 // CHECK: @{{.*}} = external addrspace(3) global [64 x float]
+groupshared float SharedArr[64];
 
 [numthreads(4,1,1)]
 void main() {
   // CHECK-LABEL: define void @main()
 
-  // CHECK: call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U1S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
-  fn(SharedArr);
+  // CHECK: call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U1S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixStoreToMemory(mat, SharedArr, 1, 2, 3);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl
index e3694e1eb4..d300796b67 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/ast.hlsl
@@ -12,13 +12,9 @@
 
 groupshared float SharedArr[64];
 
-void fn(groupshared float Arr[64]) {
-  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
-  __builtin_LinAlg_MatrixAccumulateToMemory(mat, Arr, 0, 0, 0);
-}
-
 [shader("compute")]
 [numthreads(1,1,1)]
 void main() {
-  fn(SharedArr);
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixAccumulateToMemory(mat, SharedArr, 0, 0, 0);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl
index 8048e22922..e5a9ea4895 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixaccumulatetomemory/unavailable_pre_sm610.hlsl
@@ -2,14 +2,10 @@
 
 groupshared float SharedArr[64];
 
-void fn(groupshared float Arr[64], float F) {
+[numthreads(4,1,1)]
+void main() {
   __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
 
   // expected-error@+1{{intrinsic __builtin_LinAlg_MatrixAccumulateToMemory potentially used by ''main'' requires shader model 6.10 or greater}}
-  __builtin_LinAlg_MatrixAccumulateToMemory(mat, Arr, 0, 0, 0);
-}
-
-[numthreads(4,1,1)]
-void main() {
-  fn(SharedArr, 6.0);
+  __builtin_LinAlg_MatrixAccumulateToMemory(mat, SharedArr, 0, 0, 0);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl
index 2874ba3c37..3ac0de3880 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/ast.hlsl
@@ -12,13 +12,9 @@
 
 groupshared float SharedArr[64];
 
-void fn(groupshared float Arr[64]) {
-  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
-  __builtin_LinAlg_MatrixLoadFromMemory(mat, Arr, 0, 0, 0);
-}
-
 [shader("compute")]
 [numthreads(1,1,1)]
 void main() {
-  fn(SharedArr);
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixLoadFromMemory(mat, SharedArr, 0, 0, 0);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl
index af3dd3b846..d8472ad92b 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixloadfrommemory/unavailable_pre_sm610.hlsl
@@ -2,14 +2,10 @@
 
 groupshared float SharedArr[64];
 
-void fn(groupshared float Arr[64], float F) {
+[numthreads(4,1,1)]
+void main() {
   __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
 
   // expected-error@+1{{intrinsic __builtin_LinAlg_MatrixLoadFromMemory potentially used by ''main'' requires shader model 6.10 or greater}}
-  __builtin_LinAlg_MatrixLoadFromMemory(mat, Arr, 0, 0, 0);
-}
-
-[numthreads(4,1,1)]
-void main() {
-  fn(SharedArr, 6.0);
+  __builtin_LinAlg_MatrixLoadFromMemory(mat, SharedArr, 0, 0, 0);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl
index 1c2520fe6c..c726d119eb 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/ast.hlsl
@@ -12,13 +12,9 @@
 
 groupshared float SharedArr[64];
 
-void fn(groupshared float Arr[64]) {
-  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
-  __builtin_LinAlg_MatrixStoreToMemory(mat, Arr, 0, 0, 0);
-}
-
 [shader("compute")]
 [numthreads(1,1,1)]
 void main() {
-  fn(SharedArr);
+  __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
+  __builtin_LinAlg_MatrixStoreToMemory(mat, SharedArr, 0, 0, 0);
 }
diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl
index 934963f5dc..d3468a2a02 100644
--- a/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl
+++ b/tools/clang/test/SemaHLSL/hlsl/linalg/builtins/matrixstoretomemory/unavailable_pre_sm610.hlsl
@@ -2,14 +2,10 @@
 
 groupshared float SharedArr[64];
 
-void fn(groupshared float Arr[64], float F) {
+[numthreads(4,1,1)]
+void main() {
   __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
 
   // expected-error@+1{{intrinsic __builtin_LinAlg_MatrixStoreToMemory potentially used by ''main'' requires shader model 6.10 or greater}}
-  __builtin_LinAlg_MatrixStoreToMemory(mat, Arr, 0, 0, 0);
-}
-
-[numthreads(4,1,1)]
-void main() {
-  fn(SharedArr, 6.0);
+  __builtin_LinAlg_MatrixStoreToMemory(mat, SharedArr, 0, 0, 0);
 }

From 6ff022c0aa01b25464d11cc8139e96900135dcb2 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Wed, 11 Mar 2026 19:34:20 -0600
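Subject: [PATCH 3/7] Rework based on feedback

Stop modelling the groupshared operand as an array overload. The
TS_Array type slot and the "a" overload character are removed; the
memory overload of LinAlgMatrix{LoadFrom,StoreTo,AccumulateTo}Memory
is the element type ("hfwi") and the operand is a groupshared pointer
to it, spelled `$x_gs<N>` in hctdb.py and generated as TGSM(EXT(N)).
GetOverloadType unwraps that pointer with getPointerElementType(), and
the HL lowering passes a GEP to the first array element.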

---
 include/dxc/DXIL/DxilOperations.h             |   1 -
 lib/DXIL/DxilOperations.cpp                   | 123 +++++++++---------
 lib/HLSL/HLOperationLower.cpp                 |  17 ++-
 .../matrixaccumulatetomemory/nominal.hlsl     |   2 +-
 .../matrixloadfrommemory/nominal.hlsl         |   2 +-
 .../builtins/matrixstoretomemory/nominal.hlsl |   2 +-
 utils/hct/hctdb.py                            |  22 ++--
 utils/hct/hctdb_instrhelp.py                  |  39 ++++--
 8 files changed, 117 insertions(+), 91 deletions(-)

diff --git a/include/dxc/DXIL/DxilOperations.h b/include/dxc/DXIL/DxilOperations.h
index 85df375b3a..bab4bffc6e 100644
--- a/include/dxc/DXIL/DxilOperations.h
+++ b/include/dxc/DXIL/DxilOperations.h
@@ -212,7 +212,6 @@ class OP {
     TS_UDT = 8,      // Ex: %"struct.MyStruct" *
     TS_Object = 9,   // Ex: %"class.StructuredBuffer<Foo>"
     TS_Vector = 10,  // Ex: <8 x i16>
-    TS_Array = 11,   // Ex: [8 x float]
     TS_MaskBitCount, // Types used in Mask end here
     // TS_Extended is only used to identify the unnamed struct type used to wrap
     // multiple overloads when using GetTypeSlot.
diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
index 02dcfe65a0..ffff4eccd9 100644
--- a/lib/DXIL/DxilOperations.cpp
+++ b/lib/DXIL/DxilOperations.cpp
@@ -2863,8 +2863,8 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      "linAlgMatrixLoadFromMemory",
      Attribute::None,
      2,
-     {{0x200}, {0x800}},
-     {{0x0}, {0x0}}}, // Overloads: o,a
+     {{0x200}, {0x63}},
+     {{0x0}, {0x0}}}, // Overloads: o,hfwi
     {OC::LinAlgMatrixLength,
      "LinAlgMatrixLength",
      OCC::LinAlgMatrixLength,
@@ -2911,8 +2911,8 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      "linAlgMatrixStoreToMemory",
      Attribute::None,
      2,
-     {{0x200}, {0x800}},
-     {{0x0}, {0x0}}}, // Overloads: o,a
+     {{0x200}, {0x63}},
+     {{0x0}, {0x0}}}, // Overloads: o,hfwi
     {OC::LinAlgMatrixQueryAccumulatorLayout,
      "LinAlgMatrixQueryAccumulatorLayout",
      OCC::LinAlgMatrixQueryAccumulatorLayout,
@@ -2967,8 +2967,8 @@ static const OP::OpCodeProperty ExperimentalOps_OpCodeProps[] = {
      "linAlgMatrixAccumulateToMemory",
      Attribute::None,
      2,
-     {{0x200}, {0x800}},
-     {{0x0}, {0x0}}}, // Overloads: o,a
+     {{0x200}, {0x63}},
+     {{0x0}, {0x0}}}, // Overloads: o,hfwi
     {OC::LinAlgMatrixOuterProduct,
      "LinAlgMatrixOuterProduct",
      OCC::LinAlgMatrixOuterProduct,
@@ -3152,8 +3152,6 @@ unsigned OP::GetTypeSlot(Type *pType) {
       return TS_Extended;
   case Type::VectorTyID:
     return TS_Vector;
-  case Type::ArrayTyID:
-    return TS_Array;
   default:
     break;
   }
@@ -3194,12 +3192,6 @@ StringRef OP::GetTypeName(Type *Ty, SmallVectorImpl<char> &Storage) {
                 GetOverloadTypeName(OP::GetTypeSlot(VecTy->getElementType()))))
         .toStringRef(Storage);
   }
-  case TS_Array: {
-    if (Ty->isPointerTy())
-      Ty = Ty->getPointerElementType();
-    ArrayType *ArrTy = cast<ArrayType>(Ty);
-    return GetOverloadTypeName(OP::GetTypeSlot(ArrTy->getArrayElementType()));
-  }
   case TS_Extended: {
     DXASSERT(isa<StructType>(Ty),
              "otherwise, extended overload type not wrapped in struct type.");
@@ -4332,9 +4324,10 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
 #define VEC2(_y) A(VectorType::get(_y, 2))
 #define VEC4(_y) A(GetStructVectorType(4, _y))
 #define VEC9(_y) A(VectorType::get(_y, 9))
+#define TGSM(_y) A(PointerType::get(_y, DXIL::kTGSMAddrSpace))
 
 // Extended Overload types are wrapped in an anonymous struct
-#define EXT(_y) A(cast<StructType>(pOverloadType)->getElementType(_y))
+#define EXT(_y) cast<StructType>(pOverloadType)->getElementType(_y)
 
   /* <py::lines('OPCODE-OLOAD-FUNCS')>hctdb_instrhelp.get_oloads_funcs()</py>*/
   switch (opCode) { // return     opCode
@@ -6445,9 +6438,9 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
 
     // Linear Algebra Operations
   case OpCode::MatVecMul:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    A(EXT(1));
     A(pI1);
     A(pI32);
     A(pRes);
@@ -6461,9 +6454,9 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     A(pI1);
     break;
   case OpCode::MatVecMulAdd:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    A(EXT(1));
     A(pI1);
     A(pI32);
     A(pRes);
@@ -6482,8 +6475,8 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::OuterProductAccumulate:
     A(pV);
     A(pI32);
-    EXT(0);
-    EXT(1);
+    A(EXT(0));
+    A(EXT(1));
     A(pRes);
     A(pI32);
     A(pI32);
@@ -6586,21 +6579,21 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
 
     // Linear Algebra Operations
   case OpCode::LinAlgMatrixMultiplyAccumulate:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
-    EXT(2);
-    EXT(3);
+    A(EXT(1));
+    A(EXT(2));
+    A(EXT(3));
     break;
   case OpCode::LinAlgFillMatrix:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    A(EXT(1));
     break;
   case OpCode::LinAlgCopyConvertMatrix:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    A(EXT(1));
     A(pI1);
     break;
   case OpCode::LinAlgMatrixLoadFromDescriptor:
@@ -6612,9 +6605,9 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     A(pI32);
     break;
   case OpCode::LinAlgMatrixLoadFromMemory:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    TGSM(EXT(1));
     A(pI32);
     A(pI32);
     A(pI32);
@@ -6631,17 +6624,17 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     A(pI32);
     break;
   case OpCode::LinAlgMatrixGetElement:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    A(EXT(1));
     A(pI32);
     break;
   case OpCode::LinAlgMatrixSetElement:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
+    A(EXT(1));
     A(pI32);
-    EXT(2);
+    A(EXT(2));
     break;
   case OpCode::LinAlgMatrixStoreToDescriptor:
     A(pV);
@@ -6655,8 +6648,8 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::LinAlgMatrixStoreToMemory:
     A(pV);
     A(pI32);
-    EXT(0);
-    EXT(1);
+    A(EXT(0));
+    TGSM(EXT(1));
     A(pI32);
     A(pI32);
     A(pI32);
@@ -6666,31 +6659,31 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
     A(pI32);
     break;
   case OpCode::LinAlgMatrixMultiply:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
-    EXT(2);
+    A(EXT(1));
+    A(EXT(2));
     break;
   case OpCode::LinAlgMatrixAccumulate:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
-    EXT(2);
+    A(EXT(1));
+    A(EXT(2));
     break;
   case OpCode::LinAlgMatVecMul:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
-    EXT(2);
+    A(EXT(1));
+    A(EXT(2));
     A(pI32);
     break;
   case OpCode::LinAlgMatVecMulAdd:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
-    EXT(2);
+    A(EXT(1));
+    A(EXT(2));
     A(pI32);
-    EXT(3);
+    A(EXT(3));
     A(pI32);
     break;
   case OpCode::LinAlgMatrixAccumulateToDescriptor:
@@ -6705,17 +6698,17 @@ Function *OP::GetOpFunc(OpCode opCode, Type *pOverloadType) {
   case OpCode::LinAlgMatrixAccumulateToMemory:
     A(pV);
     A(pI32);
-    EXT(0);
-    EXT(1);
+    A(EXT(0));
+    TGSM(EXT(1));
     A(pI32);
     A(pI32);
     A(pI32);
     break;
   case OpCode::LinAlgMatrixOuterProduct:
-    EXT(0);
+    A(EXT(0));
     A(pI32);
-    EXT(1);
-    EXT(2);
+    A(EXT(1));
+    A(EXT(2));
     break;
 
     //
@@ -7082,7 +7075,6 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
   case OpCode::MatVecMulAdd:
   case OpCode::LinAlgFillMatrix:
   case OpCode::LinAlgCopyConvertMatrix:
-  case OpCode::LinAlgMatrixLoadFromMemory:
   case OpCode::LinAlgMatrixGetElement:
     if (FT->getNumParams() < 2)
       return nullptr;
@@ -7090,8 +7082,6 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
                                  {FT->getReturnType(), FT->getParamType(1)});
 
   case OpCode::OuterProductAccumulate:
-  case OpCode::LinAlgMatrixStoreToMemory:
-  case OpCode::LinAlgMatrixAccumulateToMemory:
     if (FT->getNumParams() < 3)
       return nullptr;
     return llvm::StructType::get(Ctx,
@@ -7104,12 +7094,27 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) {
                                  {FT->getReturnType(), FT->getParamType(1),
                                   FT->getParamType(2), FT->getParamType(3)});
 
+  case OpCode::LinAlgMatrixLoadFromMemory:
+    if (FT->getNumParams() < 2)
+      return nullptr;
+    return llvm::StructType::get(
+        Ctx,
+        {FT->getReturnType(), FT->getParamType(1)->getPointerElementType()});
+
   case OpCode::LinAlgMatrixSetElement:
     if (FT->getNumParams() < 4)
       return nullptr;
     return llvm::StructType::get(
         Ctx, {FT->getReturnType(), FT->getParamType(1), FT->getParamType(3)});
 
+  case OpCode::LinAlgMatrixStoreToMemory:
+  case OpCode::LinAlgMatrixAccumulateToMemory:
+    if (FT->getNumParams() < 3)
+      return nullptr;
+    return llvm::StructType::get(
+        Ctx,
+        {FT->getParamType(1), FT->getParamType(2)->getPointerElementType()});
+
   case OpCode::LinAlgMatrixMultiply:
   case OpCode::LinAlgMatrixAccumulate:
   case OpCode::LinAlgMatVecMul:
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 9ea6166f36..6d718257d4 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -7242,11 +7242,15 @@ Value *TranslateLinAlgMatrixLoadFromMemory(
   Value *Stride = CI->getArgOperand(4);
   Value *Layout = CI->getArgOperand(5);
 
+  Value *Zero = Builder.getInt32(0);
+  Value *ArrPtr = Builder.CreateGEP(Arr, {Zero, Zero});
+  Type *ArrEltTy = ArrPtr->getType()->getPointerElementType();
+
   Constant *OpArg = HlslOp->GetU32Const((unsigned)OpCode);
-  Function *DxilFunc = HlslOp->GetOpFunc(OpCode, {MatrixType, Arr->getType()});
+  Function *DxilFunc = HlslOp->GetOpFunc(OpCode, {MatrixType, ArrEltTy});
 
   Value *Matrix =
-      Builder.CreateCall(DxilFunc, {OpArg, Arr, Offset, Stride, Layout});
+      Builder.CreateCall(DxilFunc, {OpArg, ArrPtr, Offset, Stride, Layout});
   Builder.CreateStore(Matrix, MatrixPtr);
 
   return nullptr;
@@ -7265,12 +7269,15 @@ Value *TranslateLinAlgMatrixAccumStoreToMemory(
   Value *Stride = CI->getArgOperand(4);
   Value *Layout = CI->getArgOperand(5);
 
+  Value *Zero = Builder.getInt32(0);
+  Value *ArrPtr = Builder.CreateGEP(Arr, {Zero, Zero});
+  Type *ArrEltTy = ArrPtr->getType()->getPointerElementType();
+
   Constant *OpArg = HlslOp->GetU32Const((unsigned)OpCode);
-  Function *DxilFunc =
-      HlslOp->GetOpFunc(OpCode, {Matrix->getType(), Arr->getType()});
+  Function *DxilFunc = HlslOp->GetOpFunc(OpCode, {Matrix->getType(), ArrEltTy});
 
   return Builder.CreateCall(DxilFunc,
-                            {OpArg, Matrix, Arr, Offset, Stride, Layout});
+                            {OpArg, Matrix, ArrPtr, Offset, Stride, Layout});
 }
 
 } // namespace
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
index cfdac39028..f05366d62f 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixaccumulatetomemory/nominal.hlsl
@@ -8,7 +8,7 @@ groupshared float SharedArr[64];
 void main() {
   // CHECK-LABEL: define void @main()
 
-  // CHECK: call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U1S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  // CHECK: call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U1S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, float addrspace(3)* getelementptr {{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
   __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
   __builtin_LinAlg_MatrixAccumulateToMemory(mat, SharedArr, 1, 2, 3);
 }
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
index a3e383ca58..9c1e8303b2 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixloadfrommemory/nominal.hlsl
@@ -8,7 +8,7 @@ groupshared float SharedArr[64];
 void main() {
   // CHECK-LABEL: define void @main()
 
-  // CHECK: call %dx.types.LinAlgMatrixC4M5N4U1S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U1S2.f32(i32 -2147483633, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  // CHECK: call %dx.types.LinAlgMatrixC4M5N4U1S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U1S2.f32(i32 -2147483633, float addrspace(3)* getelementptr {{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
   __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
   __builtin_LinAlg_MatrixLoadFromMemory(mat, SharedArr, 1, 2, 3);
 }
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
index 4b5b50c357..07a4fa38e5 100644
--- a/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
+++ b/tools/clang/test/CodeGenDXIL/hlsl/linalg/builtins/matrixstoretomemory/nominal.hlsl
@@ -8,7 +8,7 @@ groupshared float SharedArr[64];
 void main() {
   // CHECK-LABEL: define void @main()
 
-  // CHECK: call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U1S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, [64 x float] addrspace(3)* nonnull @{{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
+  // CHECK: call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U1S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U1S2 {{.*}}, float addrspace(3)* getelementptr {{.*}}, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   __builtin_LinAlgMatrix [[__LinAlgMatrix_Attributes(4, 5, 4, 1, 2)]] mat;
   __builtin_LinAlg_MatrixStoreToMemory(mat, SharedArr, 1, 2, 3);
 }
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 5dbb59102f..b72006efa9 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -55,7 +55,7 @@
 # - "a" is for any array ([n x Ty])
 # dxil_all_user_oload_chars must be kept in sync with the indices in
 # hlsl::OP::TypeSlot in DxilOperations.h.
-dxil_all_user_oload_chars = "hfd18wiluo<a"
+dxil_all_user_oload_chars = "hfd18wiluo<"
 dxil_scalar_oload_chars = "hfd18wil"
 
 # Maximum number of overload dimensions supported through the extended overload
@@ -296,8 +296,12 @@ def check_extended_oload_ops(self):
             return
         next_oload_idx = 0
         for i in self.ops:
-            if i.llvm_type.startswith("$x"):
-                if i.llvm_type != "$x" + str(next_oload_idx):
+            # _gs is extra metadata info on the overload. It has no impact on
+            # the ordering rules so it can be erased for the check.
+            # $x_gs7 -> $x7
+            ty = i.llvm_type.replace("_gs", "")
+            if ty.startswith("$x"):
+                if ty != "$x" + str(next_oload_idx):
                     raise ValueError(
                         "Extended overloads are not sequentially referenced in "
                         f"DXIL op {self.name}: {i.llvm_type} != $x{next_oload_idx}"
@@ -6407,12 +6411,12 @@ def populate_ExperimentalOps(self):
             "LinAlgMatrixLoadFromMemory",
             "LinAlgMatrixLoadFromMemory",
             "fills a matrix with data from a groupshared array",
-            "o,a",
+            "o,hfwi",
             "",
             [
                 db_dxil_param(0, "$x0", "", "resulting matrix"),
                 db_dxil_param(
-                    2, "$x1", "memory", "groupshared array to fill matrix with"
+                    2, "$x_gs1", "memory", "groupshared array to fill matrix with"
                 ),
                 db_dxil_param(3, "i32", "offset", "starting offset in the array"),
                 db_dxil_param(
@@ -6508,13 +6512,13 @@ def populate_ExperimentalOps(self):
             "LinAlgMatrixStoreToMemory",
             "LinAlgMatrixStoreToMemory",
             "stores a matrix to groupshared memory",
-            "o,a",
+            "o,hfwi",
             "",
             [
                 db_dxil_param(0, "v", "", ""),
                 db_dxil_param(2, "$x0", "matrix", "matrix to be stored"),
                 db_dxil_param(
-                    3, "$x1", "memory", "groupshared array to store into"
+                    3, "$x_gs1", "memory", "groupshared array to store into"
                 ),
                 db_dxil_param(4, "i32", "offset", "starting offset in the array"),
                 db_dxil_param(
@@ -6625,13 +6629,13 @@ def populate_ExperimentalOps(self):
             "LinAlgMatrixAccumulateToMemory",
             "LinAlgMatrixAccumulateToMemory",
             "accumulates a matrix to groupshared memory",
-            "o,a",
+            "o,hfwi",
             "",
             [
                 db_dxil_param(0, "v", "", ""),
                 db_dxil_param(2, "$x0", "matrix", "Accumulator matrix"),
                 db_dxil_param(
-                    3, "$x1", "memory", "groupshared array to accumulate into"
+                    3, "$x_gs1", "memory", "groupshared array to accumulate into"
                 ),
                 db_dxil_param(4, "i32", "offset", "starting offset in the array"),
                 db_dxil_param(
diff --git a/utils/hct/hctdb_instrhelp.py b/utils/hct/hctdb_instrhelp.py
index 5e09578af7..91636c309e 100644
--- a/utils/hct/hctdb_instrhelp.py
+++ b/utils/hct/hctdb_instrhelp.py
@@ -644,10 +644,15 @@ def print_opfunc_table(self):
             "noderecordproperty": "A(nodeRecordProperty);",
             "hit_object": "A(pHit);",
             # Extended overload slots, extend as needed:
-            "$x0": "EXT(0);",
-            "$x1": "EXT(1);",
-            "$x2": "EXT(2);",
-            "$x3": "EXT(3);",
+            "$x0": "A(EXT(0));",
+            "$x1": "A(EXT(1));",
+            "$x2": "A(EXT(2));",
+            "$x3": "A(EXT(3));",
+            # Groupshared pointers to extended overloads:
+            "$x_gs0": "TGSM(EXT(0));",
+            "$x_gs1": "TGSM(EXT(1));",
+            "$x_gs2": "TGSM(EXT(2));",
+            "$x_gs3": "TGSM(EXT(3));",
         }
         last_category = None
         for i in self.db.get_dxil_ops():
@@ -679,6 +684,7 @@ def print_opfunc_oload_type(self):
         vec_ty = "$vec"
         gsptr_ty = "$gsptr"
         extended_ty = "$x"
+        extended_gs_ty = "$x_gs"
         last_category = None
 
         index_dict = collections.OrderedDict()
@@ -846,7 +852,7 @@ def print_opfunc_oload_type(self):
             # indices the key, and add the opcode to a list of opcodes for that
             # key.  Indices start with 0 for return type, and 1 for the first
             # function parameter, which is the DXIL OpCode.
-            indices = []
+            indices = [] # (op.pos, unwrap_pointer) pairs
             for index, op in enumerate(instr.ops):
                 # Skip dxil opcode.
                 if op.pos == 1:
@@ -854,8 +860,10 @@ def print_opfunc_oload_type(self):
 
                 op_type = op.llvm_type
                 if op_type.startswith(extended_ty):
+                    gs_ptr = op_type.startswith(extended_gs_ty)
+                    prefix_len = len(extended_gs_ty) if gs_ptr else len(extended_ty)
                     try:
-                        extended_index = int(op_type[2:])
+                        extended_index = int(op_type[prefix_len:])
                     except:
                         raise ValueError(
                             "Error parsing extended operand type "
@@ -866,7 +874,7 @@ def print_opfunc_oload_type(self):
                             f"'$x{extended_index}' is not in sequential "
                             + f"order for DXIL op '{instr.name}'"
                         )
-                    indices.append(op.pos)
+                    indices.append((op.pos, gs_ptr))
 
             if len(indices) != instr.num_oloads:
                 raise ValueError(
@@ -875,23 +883,26 @@ def print_opfunc_oload_type(self):
                 )
             extended_dict.setdefault(tuple(indices), []).append(instr.name)
 
-        def get_type_at_index(index):
-            if index == 0:
-                return "FT->getReturnType()"
-            return f"FT->getParamType({index - 1})"
+        def get_type_at_index(index, unwrap_pointer):
+            result = "FT->getReturnType()"
+            if index > 0:
+                result = f"FT->getParamType({index - 1})"
+            if unwrap_pointer:
+                result = result + "->getPointerElementType()"
+            return result
 
         for index_tuple, opcodes in extended_dict.items():
             line = ""
             for opcode in opcodes:
                 line = line + f"case OpCode::{opcode}:\n"
-            if index_tuple[-1] > 0:
+            if index_tuple[-1][0] > 0:
                 line += (
-                    f"  if (FT->getNumParams() < {index_tuple[-1]})\n"
+                    f"  if (FT->getNumParams() < {index_tuple[-1][0]})\n"
                     + "    return nullptr;\n"
                 )
             line += (
                 "  return llvm::StructType::get(Ctx, {"
-                + ", ".join([get_type_at_index(index) for index in index_tuple])
+                + ", ".join([get_type_at_index(index, unwrap_pointer) for index, unwrap_pointer in index_tuple])
                 + "});\n"
             )
             print(line)

From 36b93d5f2d3cf7f148e0d96f876833390bf89609 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Wed, 11 Mar 2026 19:46:59 -0600
Subject: [PATCH 4/7] Cleanup comments

---
 utils/hct/hctdb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index b72006efa9..6e9d2920e1 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -51,8 +51,8 @@
 #     processing.
 # - "," is used to separate multiple overload dimensions.
 #   - When used, only $x0, $x1, etc. are supported for overloaded parameter
-#     types.
-# - "a" is for any array ([n x Ty])
+#     types. $x_gs0, $x_gs1, etc. work like $xN, except the parameter is a
+#     groupshared pointer whose pointee type is the overload type.
 # dxil_all_user_oload_chars must be kept in sync with the indices in
 # hlsl::OP::TypeSlot in DxilOperations.h.
 dxil_all_user_oload_chars = "hfd18wiluo<"

From 7ec1237f7179d5a6e8e1b023e81757ee6230956d Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas@microsoft.com>
Date: Fri, 6 Mar 2026 18:14:07 -0800
Subject: [PATCH 5/7] [SM6.10] Add LinAlgMatrix ops validation tests for all
 stages

---
 .../LinAlgMatrix/linalgmatrix-as.ll           | 167 +++++++
 .../LinAlgMatrix/linalgmatrix-cs.ll           | 159 +++++++
 .../LinAlgMatrix/linalgmatrix-ds.ll           | 193 +++++++++
 .../LinAlgMatrix/linalgmatrix-gs.ll           | 198 +++++++++
 .../LinAlgMatrix/linalgmatrix-hs.ll           | 205 +++++++++
 .../LinAlgMatrix/linalgmatrix-ms.ll           | 183 ++++++++
 .../LinAlgMatrix/linalgmatrix-node.ll         | 188 ++++++++
 .../LinAlgMatrix/linalgmatrix-ps.ll           | 187 ++++++++
 .../LinAlgMatrix/linalgmatrix-raytracing.ll   | 408 ++++++++++++++++++
 .../LinAlgMatrix/linalgmatrix-vs.ll           | 188 ++++++++
 10 files changed, 2076 insertions(+)
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll
 create mode 100644 tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll

diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll
new file mode 100644
index 0000000000..3fa243952e
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll
@@ -0,0 +1,167 @@
+; REQUIRES: dxil-1-10
+; RUN: %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Validation succeeded.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%struct.AmpPayload.0 = type { [2 x float] }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @mainAS() {
+  
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ;
+  ; Built-ins allowed in all stages
+  ;
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ;
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
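+  ; FIXME: cover the 3 remaining ops once they land (presumably the groupshared LoadFromMemory/StoreToMemory/AccumulateToMemory built-ins; confirm)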
+  
+  %2 = alloca %struct.AmpPayload.0, align 8
+  call void @dx.op.dispatchMesh.struct.AmpPayload.0(i32 173, i32 8, i32 1, i32 1, %struct.AmpPayload.0* nonnull %2)  ; DispatchMesh(threadGroupCountX,threadGroupCountY,threadGroupCountZ,payload)
+  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.dispatchMesh.struct.AmpPayload.0(i32, i32, i32, i32, %struct.AmpPayload.0*) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.entryPoints = !{!9}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"as", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{void ()* @mainAS, !"mainAS", null, !6, !10}
+!10 = !{i32 0, i64 8589934608, i32 10, !11}
+!11 = !{!12, i32 8}
+!12 = !{i32 8, i32 1, i32 1}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll
new file mode 100644
index 0000000000..630ef3908a
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll
@@ -0,0 +1,159 @@
+; REQUIRES: dxil-1-10
+; RUN: %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Validation succeeded.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @mainCS() {
+  
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ;
+  ; Built-ins allowed in all stages
+  ;
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ;
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
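+  ; FIXME: cover the 3 remaining ops once they land (presumably the groupshared LoadFromMemory/StoreToMemory/AccumulateToMemory built-ins; confirm)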
+  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.entryPoints = !{!9}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"cs", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{void ()* @mainCS, !"mainCS", null, !6, !10}
+!10 = !{i32 0, i64 8589934608, i32 4, !11}
+!11 = !{i32 4, i32 4, i32 4}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll
new file mode 100644
index 0000000000..51da8f2a7d
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll
@@ -0,0 +1,193 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixMultiply not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixAccumulate not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixLength not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgCopyConvertMatrix not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgFillMatrix not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixGetElement not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixSetElement not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  MainDS: error: Function uses features incompatible with the shader stage (ds) of the entry function.
+; CHECK: Validation failed.
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @MainDS() {
+
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ;
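+  ; Built-ins allowed in all stages (NOTE(review): the expected errors at the top of this file also reject LinAlgMatrixAccumulate and LinAlgMatrixLength in ds; confirm the grouping)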
+  ;
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ;
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
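+  ; FIXME: cover the 3 remaining ops once they land (presumably the groupshared LoadFromMemory/StoreToMemory/AccumulateToMemory built-ins; confirm)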
+
+  %2 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
+  %3 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
+  %4 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 2, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
+  %5 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 3, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float %2)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float %3)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float %4)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float %5)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+; Function Attrs: nounwind readnone
+declare float @dx.op.loadInput.f32(i32, i32, i32, i8, i32) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.storeOutput.f32(i32, i32, i32, i8, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.viewIdState = !{!9}
+!dx.entryPoints = !{!10}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"ds", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{[7 x i32] [i32 4, i32 4, i32 1, i32 2, i32 4, i32 8, i32 0]}
+!10 = !{void ()* @MainDS, !"MainDS", !11, !6, !16}
+!11 = !{!12, !12, null}
+!12 = !{!13}
+!13 = !{i32 0, !"SV_Position", i8 9, i8 3, !14, i8 4, i32 1, i8 4, i32 0, i8 0, !15}
+!14 = !{i32 0}
+!15 = !{i32 3, i32 15}
+!16 = !{i32 0, i64 8590000144, i32 2, !17}
+!17 = !{i32 2, i32 3}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll
new file mode 100644
index 0000000000..f471d3c8c2
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll
@@ -0,0 +1,198 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixMultiply not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixAccumulate not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixLength not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgCopyConvertMatrix not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgFillMatrix not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixGetElement not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixSetElement not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  MainGS: error: Function uses features incompatible with the shader stage (gs) of the entry function.
+; CHECK: Validation failed.
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @MainGS() {
+  ; Calls the LinAlg dx.op intrinsics from a geometry shader entry; the CHECK lines at the top of this file list the errors dxv is expected to report.
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ;
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the CHECK lines still expect LinAlgMatrixAccumulate and LinAlgMatrixLength to be rejected in gs_6_10 - confirm this grouping.
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; (each opcode called in this section has a matching "not valid in shader model gs_6_10" CHECK line)
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: extend this test with the 3 remaining LinAlg ops once they are available
+
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+01)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+01)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 1.000000e+01)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+01)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.emitStream(i32 97, i8 0)  ; EmitStream(streamId)
+  call void @dx.op.cutStream(i32 98, i8 0)  ; CutStream(streamId)
+  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #2
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #2
+
+; Function Attrs: nounwind
+declare void @dx.op.cutStream(i32, i8) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.emitStream(i32, i8) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.storeOutput.f32(i32, i32, i32, i8, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+attributes #2 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.viewIdState = !{!9}
+!dx.entryPoints = !{!10}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"gs", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{[9 x i32] [i32 4, i32 4, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0]}
+!10 = !{void ()* @MainGS, !"MainGS", !11, !6, !18}
+!11 = !{!12, !15, null}
+!12 = !{!13}
+!13 = !{i32 0, !"SV_Position", i8 9, i8 3, !14, i8 4, i32 1, i8 4, i32 0, i8 0, null}
+!14 = !{i32 0}
+!15 = !{!16}
+!16 = !{i32 0, !"SV_Position", i8 9, i8 3, !14, i8 4, i32 1, i8 4, i32 0, i8 0, !17}
+!17 = !{i32 3, i32 15}
+!18 = !{i32 0, i64 8590000144, i32 1, !19}
+!19 = !{i32 3, i32 1, i32 1, i32 1, i32 1}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll
new file mode 100644
index 0000000000..a24cbf5cf7
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll
@@ -0,0 +1,205 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixMultiply not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixAccumulate not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixLength not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgCopyConvertMatrix not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgFillMatrix not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixGetElement not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixSetElement not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  MainHS: error: Function uses features incompatible with the shader stage (hs) of the entry function.
+; CHECK: Validation failed.
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @MainHS() {
+  ; Calls the LinAlg dx.op intrinsics from a hull shader entry; the CHECK lines at the top of this file list the errors dxv is expected to report.
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ;
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the CHECK lines still expect LinAlgMatrixAccumulate and LinAlgMatrixLength to be rejected in hs_6_10 - confirm this grouping.
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; (each opcode called in this section has a matching "not valid in shader model hs_6_10" CHECK line)
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: extend this test with the 3 remaining LinAlg ops once they are available
+
+  ret void
+}
+
+define void @"\01?HSPatch@@YA?AUPCStruct@@V?$InputPatch@UPosStruct@@$02@@V?$OutputPatch@UPosStruct@@$02@@I@Z"() {
+  %1 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 3, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis): input element 0 (SV_Position), col 3
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 0, i32 0, i8 0, float %1)  ; StorePatchConstant(outputSigID,row,col,value): element 0 (SV_TessFactor), row 0
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 0, i32 1, i8 0, float %1)  ; StorePatchConstant(outputSigID,row,col,value): element 0 (SV_TessFactor), row 1
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 0, i32 2, i8 0, float %1)  ; StorePatchConstant(outputSigID,row,col,value): element 0 (SV_TessFactor), row 2
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 1, i32 0, i8 0, float %1)  ; StorePatchConstant(outputSigID,row,col,value): element 1 (SV_InsideTessFactor)
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 2, i32 0, i8 0, float undef)  ; StorePatchConstant(outputSigID,row,col,value): element 2 (TEST), col 0
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 2, i32 0, i8 1, float undef)  ; StorePatchConstant(outputSigID,row,col,value): element 2 (TEST), col 1
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 2, i32 0, i8 2, float undef)  ; StorePatchConstant(outputSigID,row,col,value): element 2 (TEST), col 2
+  call void @dx.op.storePatchConstant.f32(i32 106, i32 2, i32 0, i8 3, float undef)  ; StorePatchConstant(outputSigID,row,col,value): element 2 (TEST), col 3
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+
+; Function Attrs: nounwind readnone
+declare float @dx.op.loadInput.f32(i32, i32, i32, i8, i32) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.storePatchConstant.f32(i32, i32, i32, i8, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.viewIdState = !{!9}
+!dx.entryPoints = !{!10}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"hs", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{[11 x i32] [i32 4, i32 4, i32 1, i32 2, i32 4, i32 8, i32 20, i32 0, i32 0, i32 0, i32 6280]}
+!10 = !{void ()* @MainHS, !"MainHS", !11, !6, !22}
+!11 = !{!12, !12, !16}
+!12 = !{!13}
+!13 = !{i32 0, !"SV_Position", i8 9, i8 3, !14, i8 4, i32 1, i8 4, i32 0, i8 0, !15}
+!14 = !{i32 0}
+!15 = !{i32 3, i32 15}
+!16 = !{!17, !20, !21}
+!17 = !{i32 0, !"SV_TessFactor", i8 9, i8 25, !18, i8 0, i32 3, i8 1, i32 0, i8 3, !19}
+!18 = !{i32 0, i32 1, i32 2}
+!19 = !{i32 3, i32 1}
+!20 = !{i32 1, !"SV_InsideTessFactor", i8 9, i8 26, !14, i8 0, i32 1, i8 1, i32 3, i8 0, !19}
+!21 = !{i32 2, !"TEST", i8 9, i8 0, !14, i8 0, i32 1, i8 4, i32 4, i8 0, !15}
+!22 = !{i32 0, i64 8590000144, i32 3, !23}
+!23 = !{void ()* @"\01?HSPatch@@YA?AUPCStruct@@V?$InputPatch@UPosStruct@@$02@@V?$OutputPatch@UPosStruct@@$02@@I@Z", i32 3, i32 3, i32 2, i32 3, i32 3, float 6.400000e+01}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll
new file mode 100644
index 0000000000..199a63ccf4
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll
@@ -0,0 +1,183 @@
+; REQUIRES: dxil-1-10
+; RUN: %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Validation succeeded.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @mainMeS() {
+  ; Mesh-shader body: calls each LinAlg built-in once; the CHECK line expects the whole module to validate.
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  %thread_id_group = call i32 @dx.op.flattenedThreadIdInGroup.i32(i32 96)  ; FlattenedThreadIdInGroup()
+  ; Built-ins allowed in all stages.
+  ; NOTE(review): the node/ps variants of this test expect LinAlgMatrixAccumulate
+  ; and LinAlgMatrixLength to be rejected, so those two look misfiled -- confirm.
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; (this is a mesh shader, so every call below is expected to validate)
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: 3 more ops coming soon
+
+  call void @dx.op.setMeshOutputCounts(i32 168, i32 32, i32 16)  ; SetMeshOutputCounts(numVertices,numPrimitives)
+  call void @dx.op.storeVertexOutput.f32(i32 171, i32 0, i32 0, i8 0, float 0.000000e+00, i32 %thread_id_group)  ; StoreVertexOutput(outputSigId,rowIndex,colIndex,value,vertexIndex)
+  call void @dx.op.storeVertexOutput.f32(i32 171, i32 0, i32 0, i8 1, float 0.000000e+00, i32 %thread_id_group)  ; StoreVertexOutput(outputSigId,rowIndex,colIndex,value,vertexIndex)
+  call void @dx.op.storeVertexOutput.f32(i32 171, i32 0, i32 0, i8 2, float 0.000000e+00, i32 %thread_id_group)  ; StoreVertexOutput(outputSigId,rowIndex,colIndex,value,vertexIndex)
+  call void @dx.op.storeVertexOutput.f32(i32 171, i32 0, i32 0, i8 3, float 0.000000e+00, i32 %thread_id_group)  ; StoreVertexOutput(outputSigId,rowIndex,colIndex,value,vertexIndex)
+  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+; Function Attrs: nounwind readnone
+declare i32 @dx.op.flattenedThreadIdInGroup.i32(i32) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.storeVertexOutput.f32(i32, i32, i32, i8, float, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.setMeshOutputCounts(i32, i32, i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.viewIdState = !{!9}
+!dx.entryPoints = !{!10}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"ms", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{[3 x i32] [i32 0, i32 4, i32 0]}
+!10 = !{void ()* @mainMeS, !"mainMeS", !11, !6, !16}
+!11 = !{null, !12, null}
+!12 = !{!13}
+!13 = !{i32 0, !"SV_Position", i8 9, i8 3, !14, i8 4, i32 1, i8 4, i32 0, i8 0, !15}
+!14 = !{i32 0}
+!15 = !{i32 3, i32 15}
+!16 = !{i32 0, i64 8589934608, i32 9, !17}
+!17 = !{!18, i32 32, i32 0, i32 2, i32 0}
+!18 = !{i32 8, i32 8, i32 2}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll
new file mode 100644
index 0000000000..b4280ba682
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll
@@ -0,0 +1,188 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  mainNS: error: Function uses features incompatible with the shader stage (node) of the entry function.
+; CHECK: Validation failed.
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+@"\01?buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4
+
+define void @mainNS() {
+  ; Node-shader body: calls each LinAlg built-in once; the CHECK lines list the opcodes expected to be rejected.
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ; Built-ins allowed in all stages.
+  ; NOTE(review): the CHECK lines of this test expect LinAlgMatrixAccumulate and
+  ; LinAlgMatrixLength to be rejected here, so those two look misfiled -- confirm.
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; (each opcode below appears in the CHECK lines as rejected for the node stage)
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: 3 more ops coming soon
+  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+; Function Attrs: nounwind  (NOTE(review): never called by @mainNS -- presumably left over from the ps variant; confirm)
+declare void @dx.op.storeOutput.f32(i32, i32, i32, i8, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.typeAnnotations = !{!9}
+!dx.entryPoints = !{!13, !15}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"lib", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"buf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{i32 1, void ()* @mainNS, !10}
+!10 = !{!11}
+!11 = !{i32 1, !12, !12}
+!12 = !{}
+!13 = !{null, !"", null, !6, !14}
+!14 = !{i32 0, i64 8589934608}
+!15 = !{void ()* @mainNS, !"mainNS", null, null, !16}
+!16 = !{i32 8, i32 15, i32 13, i32 1, i32 15, !17, i32 16, i32 -1, i32 18, !18, i32 4, !19, i32 5, !20}
+!17 = !{!"mainNS", i32 0}
+!18 = !{i32 8, i32 1, i32 1}
+!19 = !{i32 64, i32 2, i32 2}
+!20 = !{i32 0}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll
new file mode 100644
index 0000000000..2e3a6ef71f
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll
@@ -0,0 +1,187 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixMultiply not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixAccumulate not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixLength not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgCopyConvertMatrix not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgFillMatrix not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixGetElement not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixSetElement not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  mainPS: error: Function uses features incompatible with the shader stage (ps) of the entry function.
+; CHECK: Validation failed.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @mainPS() {
+  ; Pixel-shader body: calls each LinAlg built-in once; the CHECK lines list the opcodes expected to be rejected.
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+
+  ; Built-ins allowed in all stages.
+  ; NOTE(review): the CHECK lines of this test expect LinAlgMatrixAccumulate and
+  ; LinAlgMatrixLength to be rejected here, so those two look misfiled -- confirm.
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; (each opcode below appears in the CHECK lines as rejected for ps_6_10)
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: 3 more ops coming soon
+
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.storeOutput.f32(i32, i32, i32, i8, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.viewIdState = !{!9}
+!dx.entryPoints = !{!10}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"ps", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{[3 x i32] [i32 1, i32 4, i32 0]}
+!10 = !{void ()* @mainPS, !"mainPS", !11, !6, !18}
+!11 = !{!12, !15, null}
+!12 = !{!13}
+!13 = !{i32 0, !"SV_PrimitiveID", i8 5, i8 10, !14, i8 1, i32 1, i8 1, i32 0, i8 0, null}
+!14 = !{i32 0}
+!15 = !{!16}
+!16 = !{i32 0, !"SV_Target", i8 9, i8 16, !14, i8 0, i32 1, i8 4, i32 0, i8 0, !17}
+!17 = !{i32 3, i32 15}
+!18 = !{i32 0, i64 8589934608}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll
new file mode 100644
index 0000000000..c627b5e4cc
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll
@@ -0,0 +1,408 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixMultiply not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixAccumulate not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixLength not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgCopyConvertMatrix not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgFillMatrix not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Function uses features incompatible with the shader stage (raygeneration) of the entry function.
+
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Function uses features incompatible with the shader stage (intersection) of the entry function.
+
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Function uses features incompatible with the shader stage (callable) of the entry function.
+
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Function uses features incompatible with the shader stage (anyhit) of the entry function.
+
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Function uses features incompatible with the shader stage (closesthit) of the entry function.
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Function uses features incompatible with the shader stage (miss) of the entry function.
+
+; CHECK: Validation failed.
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+%dx.types.Handle = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }  ; result type of the Accumulate / Multiply / MultiplyAccumulate calls below
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }  ; result type of the LoadFromDescriptor / OuterProduct / FillMatrix / SetElement calls below
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }  ; result type of the CopyConvertMatrix calls below
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.Attribs = type { <2 x float> }  ; pointee of the %attrs parameter of MainCL, MainAH and MainCH
+%struct.RayPayload = type { float }  ; pointee of the %pld parameter of MainAH, MainCH and MainMS
+%struct.RWByteAddressBuffer = type { i32 }
+
+@"\01?buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4  ; loaded at the top of each entry function below to build %handle
+
+define void @"\01?MainRG@@YAXXZ"() #0 {
+  
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; MainRG (reported as raygeneration in the expected diagnostics) calls both groups of LinAlg built-ins.
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the expected diagnostics also list LinAlgMatrixAccumulate and LinAlgMatrixLength as invalid here - confirm this grouping.
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; Each opcode in this group appears in the expected "not valid in shader model" diagnostics at the top of the file.
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: add the 3 remaining ops here once they are available
+  
+  ret void
+}
+
+define void @"\01?MainIS@@YAXXZ"() #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; MainIS (reported as intersection in the expected diagnostics) calls both groups of LinAlg built-ins.
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the expected diagnostics also list LinAlgMatrixAccumulate and LinAlgMatrixLength as invalid here - confirm this grouping.
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; Each opcode in this group appears in the expected "not valid in shader model" diagnostics at the top of the file.
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: add the 3 remaining ops here once they are available
+
+  ret void
+}
+
+define void @"\01?MainCL@@YAXUAttribs@@@Z"(%struct.Attribs* noalias nocapture %attrs) #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; MainCL (reported as callable in the expected diagnostics) calls both groups of LinAlg built-ins.
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the expected diagnostics also list LinAlgMatrixAccumulate and LinAlgMatrixLength as invalid here - confirm this grouping.
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; Each opcode in this group appears in the expected "not valid in shader model" diagnostics at the top of the file.
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: add the 3 remaining ops here once they are available
+
+  ret void
+}
+
+define void @"\01?MainAH@@YAXURayPayload@@UAttribs@@@Z"(%struct.RayPayload* noalias nocapture %pld, %struct.Attribs* nocapture readnone %attrs) #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; MainAH (reported as anyhit in the expected diagnostics) calls both groups of LinAlg built-ins.
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the expected diagnostics also list LinAlgMatrixAccumulate and LinAlgMatrixLength as invalid here - confirm this grouping.
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; Each opcode in this group appears in the expected "not valid in shader model" diagnostics at the top of the file.
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: add the 3 remaining ops here once they are available
+
+  ret void
+}
+
+define void @"\01?MainCH@@YAXURayPayload@@UAttribs@@@Z"(%struct.RayPayload* noalias nocapture %pld, %struct.Attribs* nocapture readnone %attrs) #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; MainCH (reported as closesthit in the expected diagnostics) calls both groups of LinAlg built-ins.
+  ; Built-ins allowed in all stages
+  ; NOTE(review): the expected diagnostics also list LinAlgMatrixAccumulate and LinAlgMatrixLength as invalid here - confirm this grouping.
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ;
+  ; Built-ins restricted to compute, mesh and amplification shaders
+  ; Each opcode in this group appears in the expected "not valid in shader model" diagnostics at the top of the file.
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: add the 3 remaining ops here once they are available
+
+  ret void
+}
+
+define void @"\01?MainMS@@YAXURayPayload@@@Z"(%struct.RayPayload* noalias nocapture %pld) #0 {
+  %1 = load %dx.types.Handle, %dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A", align 4
+  %2 = call %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32 160, %dx.types.Handle %1)  ; CreateHandleForLib(Resource)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %2, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; Built-ins allowed in all stages.
+  ; Each call below uses a distinct LinAlgMatrix opcode. %v1-%v3 and %v5-%v7
+  ; are never read again; only %v4 feeds later calls (%v6, %v7, %v8).
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ; Built-ins restricted to compute, mesh and amplification shaders.
+  ; Data flow: %v4 -> %v8; %v9 feeds %v10-%v14 (%v12/%v13 also read %v8);
+  ; %v14 is the matrix handed to StoreToDescriptor.
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: 3 more ops coming soon
+
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readonly
+declare %dx.types.Handle @dx.op.createHandleForLib.dx.types.Handle(i32, %dx.types.Handle) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.typeAnnotations = !{!9}
+!dx.dxrPayloadAnnotations = !{!17}
+!dx.entryPoints = !{!20, !22, !25, !27, !29, !31, !33}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (Main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"lib", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* bitcast (%dx.types.Handle* @"\01?buf@@3URWByteAddressBuffer@@A" to %struct.RWByteAddressBuffer*), !"buf", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{i32 1, void ()* @"\01?MainRG@@YAXXZ", !10, void ()* @"\01?MainIS@@YAXXZ", !10, void (%struct.Attribs*)* @"\01?MainCL@@YAXUAttribs@@@Z", !13, void (%struct.RayPayload*, %struct.Attribs*)* @"\01?MainAH@@YAXURayPayload@@UAttribs@@@Z", !15, void (%struct.RayPayload*, %struct.Attribs*)* @"\01?MainCH@@YAXURayPayload@@UAttribs@@@Z", !15, void (%struct.RayPayload*)* @"\01?MainMS@@YAXURayPayload@@@Z", !13}
+!10 = !{!11}
+!11 = !{i32 1, !12, !12}
+!12 = !{}
+!13 = !{!11, !14}
+!14 = !{i32 2, !12, !12}
+!15 = !{!11, !14, !16}
+!16 = !{i32 0, !12, !12}
+!17 = !{i32 0, %struct.RayPayload undef, !18}
+!18 = !{!19}
+!19 = !{i32 0, i32 13107}
+!20 = !{null, !"", null, !6, !21}
+!21 = !{i32 0, i64 8589934608}
+!22 = !{void ()* @"\01?MainRG@@YAXXZ", !"\01?MainRG@@YAXXZ", null, null, !23}
+!23 = !{i32 8, i32 7, i32 5, !24}
+!24 = !{i32 0}
+!25 = !{void (%struct.RayPayload*, %struct.Attribs*)* @"\01?MainAH@@YAXURayPayload@@UAttribs@@@Z", !"\01?MainAH@@YAXURayPayload@@UAttribs@@@Z", null, null, !26}
+!26 = !{i32 8, i32 9, i32 6, i32 4, i32 7, i32 8, i32 5, !24}
+!27 = !{void (%struct.Attribs*)* @"\01?MainCL@@YAXUAttribs@@@Z", !"\01?MainCL@@YAXUAttribs@@@Z", null, null, !28}
+!28 = !{i32 8, i32 12, i32 6, i32 8, i32 5, !24}
+!29 = !{void (%struct.RayPayload*, %struct.Attribs*)* @"\01?MainCH@@YAXURayPayload@@UAttribs@@@Z", !"\01?MainCH@@YAXURayPayload@@UAttribs@@@Z", null, null, !30}
+!30 = !{i32 8, i32 10, i32 6, i32 4, i32 7, i32 8, i32 5, !24}
+!31 = !{void ()* @"\01?MainIS@@YAXXZ", !"\01?MainIS@@YAXXZ", null, null, !32}
+!32 = !{i32 8, i32 8, i32 5, !24}
+!33 = !{void (%struct.RayPayload*)* @"\01?MainMS@@YAXURayPayload@@@Z", !"\01?MainMS@@YAXURayPayload@@@Z", null, null, !34}
+!34 = !{i32 8, i32 11, i32 6, i32 4, i32 5, !24}
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll
new file mode 100644
index 0000000000..09a3753a6a
--- /dev/null
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll
@@ -0,0 +1,188 @@
+; REQUIRES: dxil-1-10
+; RUN: not %dxv %s 2>&1 | FileCheck %s
+
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixMultiply not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixAccumulate not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixStoreToDescriptor not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixLength not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgCopyConvertMatrix not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgFillMatrix not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixGetCoordinate not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixGetElement not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixSetElement not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
+; CHECK: Function:  mainVS: error: Function uses features incompatible with the shader stage (vs) of the entry function.
+; CHECK: Validation failed.
+
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%dx.types.Handle = type { i8* }
+%dx.types.ResBind = type { i32, i32, i32, i8 }
+%dx.types.LinAlgMatrixC4M5N4U2S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M5N4U0S2 = type { i8* }
+%dx.types.LinAlgMatrixC4M4N5U1S2 = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+%struct.RWByteAddressBuffer = type { i32 }
+
+define void @mainVS() {
+
+  %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
+  %handle = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })  ; AnnotateHandle(res,props)  resource: RWByteAddressBuffer
+  ; Vertex-shader entry; the test expects validation of this body to fail.
+  ; Built-ins allowed in all stages.
+  ; NOTE(review): the expected errors at the top of this file still include
+  ; LinAlgMatrixAccumulate and LinAlgMatrixLength from this group - confirm.
+
+  ; dx.op.linAlgMatrixAccumulate
+  %v1 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483624, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.LinAlgMatrixC4M4N5U1S2 undef)  ; LinAlgMatrixAccumulate(matrixLHS,matrixRHS)
+  
+  ; dx.op.linAlgMatrixAccumulateToDescriptor
+  call void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32 -2147483621, %dx.types.LinAlgMatrixC4M5N4U0S2 undef, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixAccumulateToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLength
+  %v2 = call i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32 -2147483632, %dx.types.LinAlgMatrixC4M5N4U0S2 undef)  ; LinAlgMatrixLength(matrix)
+  
+  ; dx.op.linAlgMatrixLoadFromDescriptor
+  %v3 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32 -2147483634, %dx.types.Handle %handle, i32 5, i32 5, i32 5)  ; LinAlgMatrixLoadFromDescriptor(handle,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixOuterProduct
+  %v4 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483619, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)  ; LinAlgMatrixOuterProduct(vectorA,vectorB)
+ 
+  ; dx.op.linAlgMatrixQueryAccumulatorLayout
+  %v5 = call i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32 -2147483626)  ; LinAlgMatrixQueryAccumulatorLayout()
+  
+  ; dx.op.linAlgMatVecMul
+  %v6 = call <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32 -2147483623, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 1)  ; LinAlgMatVecMul(matrix,inputVector,interpretation)
+  
+  ; dx.op.linAlgMatVecMulAdd
+  %v7 = call <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32 -2147483622, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, <4 x i32> <i32 9, i32 9, i32 9, i32 9>, i32 2, <4 x i32> <i32 7, i32 7, i32 7, i32 7>, i32 3)  ; LinAlgMatVecMulAdd(matrix,inputVector,inputInterpretation,biasVector,biasInterpretation)
+  
+  ; Built-ins restricted to compute, mesh and amplification shaders.
+  ; Every opcode in this group appears in the expected "not valid in shader
+  ; model vs_6_10" errors at the top of this file.
+
+  ; dx.op.linAlgCopyConvertMatrix
+  %v8 = call %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32 -2147483635, %dx.types.LinAlgMatrixC4M5N4U0S2 %v4, i1 true)  ; LinAlgCopyConvertMatrix(srcMatrix,transpose)
+ 
+  ; dx.op.linAlgFillMatrix
+  %v9 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32 -2147483636, i32 15)  ; LinAlgFillMatrix(value)
+  
+  ; dx.op.linAlgMatrixGetCoordinate
+  %v10 = call <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32 -2147483631, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetCoordinate(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixGetElement
+  %v11 = call float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32 -2147483630, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 0)  ; LinAlgMatrixGetElement(matrix,threadLocalIndex)
+  
+  ; dx.op.linAlgMatrixMultiply
+  %v12 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32 -2147483625, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8)  ; LinAlgMatrixMultiply(matrixA,matrixB)
+  
+  ; dx.op.linAlgMatrixMultiplyAccumulate
+  %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
+  
+  ; dx.op.linAlgMatrixSetElement
+  %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
+
+  ; dx.op.linAlgMatrixStoreToDescriptor
+  call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  
+  ; FIXME: 3 more ops coming soon
+
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 2, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 3, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
+  
+  ret void
+}
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiply.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToDescriptor.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixLength.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromDescriptor.mC4M5N4U0S2(i32, %dx.types.Handle, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixOuterProduct.mC4M5N4U0S2.v4i32.v4i32(i32, <4 x i32>, <4 x i32>) #0
+
+; Function Attrs: nounwind
+declare i32 @dx.op.linAlgMatrixQueryAccumulatorLayout(i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMul.v4i32.mC4M5N4U0S2.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare <4 x i32> @dx.op.linAlgMatVecMulAdd.v4i32.mC4M5N4U0S2.v4i32.v4i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, <4 x i32>, i32, <4 x i32>, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M4N5U1S2 @dx.op.linAlgCopyConvertMatrix.mC4M4N5U1S2.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i1) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgFillMatrix.mC4M5N4U0S2.i32(i32, i32) #0
+
+; Function Attrs: nounwind
+declare <2 x i32> @dx.op.linAlgMatrixGetCoordinate.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare float @dx.op.linAlgMatrixGetElement.f32.mC4M5N4U0S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, %dx.types.LinAlgMatrixC4M4N5U1S2, %dx.types.LinAlgMatrixC4M5N4U2S2) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
+
+; Function Attrs: nounwind readnone
+declare %dx.types.Handle @dx.op.createHandleFromBinding(i32, %dx.types.ResBind, i32, i1) #1
+
+; Function Attrs: nounwind
+declare void @dx.op.storeOutput.f32(i32, i32, i32, i8, float) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!dx.targetTypes = !{!0, !1, !2}
+!llvm.ident = !{!3}
+!dx.version = !{!4}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.resources = !{!6}
+!dx.viewIdState = !{!9}
+!dx.entryPoints = !{!10}
+
+!0 = !{%dx.types.LinAlgMatrixC4M5N4U0S2 undef, i32 4, i32 5, i32 4, i32 0, i32 2}
+!1 = !{%dx.types.LinAlgMatrixC4M4N5U1S2 undef, i32 4, i32 4, i32 5, i32 1, i32 2}
+!2 = !{%dx.types.LinAlgMatrixC4M5N4U2S2 undef, i32 4, i32 5, i32 4, i32 2, i32 2}
+!3 = !{!"dxc(private) 1.9.0.15241 (main, 1f63535ae)"}
+!4 = !{i32 1, i32 10}
+!5 = !{!"vs", i32 6, i32 10}
+!6 = !{null, !7, null, null}
+!7 = !{!8}
+!8 = !{i32 0, %struct.RWByteAddressBuffer* undef, !"", i32 0, i32 0, i32 1, i32 11, i1 false, i1 false, i1 false, null}
+!9 = !{[3 x i32] [i32 1, i32 4, i32 0]}
+!10 = !{void ()* @mainVS, !"mainVS", !11, !6, !18}
+!11 = !{!12, !15, null}
+!12 = !{!13}
+!13 = !{i32 0, !"SV_VertexID", i8 5, i8 1, !14, i8 0, i32 1, i8 1, i32 0, i8 0, null}
+!14 = !{i32 0}
+!15 = !{!16}
+!16 = !{i32 0, !"OUT", i8 9, i8 0, !14, i8 2, i32 1, i8 4, i32 0, i8 0, !17}
+!17 = !{i32 3, i32 15}
+!18 = !{i32 0, i64 8590000144}

From f0872f435cdba01ee1272e7fc3b3b4ab90c49839 Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas@microsoft.com>
Date: Fri, 13 Mar 2026 12:57:50 -0700
Subject: [PATCH 6/7] Add last 3 ops

---
 .../LinAlgMatrix/linalgmatrix-as.ll           | 20 +++++-
 .../LinAlgMatrix/linalgmatrix-cs.ll           | 20 +++++-
 .../LinAlgMatrix/linalgmatrix-ds.ll           | 23 ++++++-
 .../LinAlgMatrix/linalgmatrix-gs.ll           | 23 ++++++-
 .../LinAlgMatrix/linalgmatrix-hs.ll           | 23 ++++++-
 .../LinAlgMatrix/linalgmatrix-ms.ll           | 20 +++++-
 .../LinAlgMatrix/linalgmatrix-node.ll         | 22 ++++++-
 .../LinAlgMatrix/linalgmatrix-ps.ll           | 23 ++++++-
 .../LinAlgMatrix/linalgmatrix-raytracing.ll   | 63 +++++++++++++++----
 .../LinAlgMatrix/linalgmatrix-vs.ll           | 23 ++++++-
 10 files changed, 238 insertions(+), 22 deletions(-)

diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll
index 3fa243952e..8295d09ba7 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-as.ll
@@ -15,6 +15,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @mainAS() {
   
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -76,8 +78,15 @@ define void @mainAS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
   
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
+
   %2 = alloca %struct.AmpPayload.0, align 8
   call void @dx.op.dispatchMesh.struct.AmpPayload.0(i32 173, i32 8, i32 1, i32 1, %struct.AmpPayload.0* nonnull %2)  ; DispatchMesh(threadGroupCountX,threadGroupCountY,threadGroupCountZ,payload)
   
@@ -132,6 +141,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll
index 630ef3908a..68e4bf24a6 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-cs.ll
@@ -14,6 +14,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @mainCS() {
   
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -75,7 +77,14 @@ define void @mainCS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
   ret void
 }
@@ -128,6 +137,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll
index 51da8f2a7d..6f29147319 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ds.ll
@@ -11,6 +11,9 @@
 ; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixGetElement not valid in shader model ds_6_10.
 ; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model ds_6_10.
 ; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixSetElement not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model ds_6_10.
+; CHECK: Function:  MainDS: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model ds_6_10.
 ; CHECK: Function:  MainDS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  MainDS: error: Function uses features incompatible with the shader stage (ds) of the entry function.
 ; CHECK: Validation failed.
@@ -27,6 +30,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @MainDS() {
 
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -88,7 +93,14 @@ define void @MainDS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
 
   %2 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 0, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
   %3 = call float @dx.op.loadInput.f32(i32 4, i32 0, i32 0, i8 1, i32 0)  ; LoadInput(inputSigId,rowIndex,colIndex,gsVertexAxis)
@@ -149,6 +161,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll
index f471d3c8c2..a56a3d1e0b 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-gs.ll
@@ -11,6 +11,9 @@
 ; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixGetElement not valid in shader model gs_6_10.
 ; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model gs_6_10.
 ; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixSetElement not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model gs_6_10.
+; CHECK: Function:  MainGS: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model gs_6_10.
 ; CHECK: Function:  MainGS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  MainGS: error: Function uses features incompatible with the shader stage (gs) of the entry function.
 ; CHECK: Validation failed.
@@ -27,6 +30,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @MainGS() {
 
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -88,7 +93,14 @@ define void @MainGS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
 
   call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+01)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
   call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+01)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
@@ -148,6 +160,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll
index a24cbf5cf7..6dcc3accfb 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-hs.ll
@@ -11,6 +11,9 @@
 ; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixGetElement not valid in shader model hs_6_10.
 ; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model hs_6_10.
 ; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixSetElement not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model hs_6_10.
+; CHECK: Function:  MainHS: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model hs_6_10.
 ; CHECK: Function:  MainHS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  MainHS: error: Function uses features incompatible with the shader stage (hs) of the entry function.
 ; CHECK: Validation failed.
@@ -27,6 +30,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @MainHS() {
 
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -88,7 +93,14 @@ define void @MainHS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
 
   ret void
 }
@@ -154,6 +166,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll
index 199a63ccf4..9f04cb0d46 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ms.ll
@@ -14,6 +14,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @mainMeS() {
 
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -76,7 +78,14 @@ define void @mainMeS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
 
   call void @dx.op.setMeshOutputCounts(i32 168, i32 32, i32 16)  ; SetMeshOutputCounts(numVertices,numPrimitives)
   call void @dx.op.storeVertexOutput.f32(i32 171, i32 0, i32 0, i8 0, float 0.000000e+00, i32 %thread_id_group)  ; StoreVertexOutput(outputSigId,rowIndex,colIndex,value,vertexIndex)
@@ -135,6 +144,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll
index b4280ba682..c77999c15c 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-node.ll
@@ -11,6 +11,9 @@
 ; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixGetElement not valid in shader model lib_6_10(node).
 ; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model lib_6_10(node).
 ; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(node).
+; CHECK: Function:  mainNS: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(node).
 ; CHECK: Function:  mainNS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  mainNS: error: Function uses features incompatible with the shader stage (node) of the entry function.
 ; CHECK: Validation failed.
@@ -28,6 +31,7 @@ target triple = "dxil-ms-dx"
 %struct.RWByteAddressBuffer = type { i32 }
 
 @"\01?buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
 
 define void @mainNS() {
 
@@ -90,7 +94,14 @@ define void @mainNS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
   ret void
 }
@@ -143,6 +154,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll
index 2e3a6ef71f..7b8072e0ce 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-ps.ll
@@ -11,6 +11,9 @@
 ; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixGetElement not valid in shader model ps_6_10.
 ; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model ps_6_10.
 ; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixSetElement not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model ps_6_10.
+; CHECK: Function:  mainPS: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model ps_6_10.
 ; CHECK: Function:  mainPS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  mainPS: error: Function uses features incompatible with the shader stage (ps) of the entry function.
 ; CHECK: Validation failed.
@@ -26,6 +29,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @mainPS() {
 
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -87,7 +92,14 @@ define void @mainPS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
 
   call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
   call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
@@ -145,6 +157,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll
index c627b5e4cc..14588f4b7f 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-raytracing.ll
@@ -71,6 +71,27 @@
 ; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(intersection).
 ; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixSetElement not valid in shader model lib_6_10(raygeneration).
 
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model lib_6_10(raygeneration).
+
+; CHECK: Function:  {{.*}}MainMS{{.*}}: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(miss).
+; CHECK: Function:  {{.*}}MainCH{{.*}}: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(closesthit).
+; CHECK: Function:  {{.*}}MainAH{{.*}}: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(anyhit).
+; CHECK: Function:  {{.*}}MainCL{{.*}}: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(callable).
+; CHECK: Function:  {{.*}}MainIS{{.*}}: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(intersection).
+; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model lib_6_10(raygeneration).
+
 ; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  {{.*}}MainRG{{.*}}: error: Function uses features incompatible with the shader stage (raygeneration) of the entry function.
 
@@ -103,6 +124,7 @@ target triple = "dxil-ms-dx"
 %struct.RWByteAddressBuffer = type { i32 }
 
 @"\01?buf@@3URWByteAddressBuffer@@A" = external constant %dx.types.Handle, align 4
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
 
 define void @"\01?MainRG@@YAXXZ"() #0 {
   
@@ -132,8 +154,9 @@ define void @"\01?MainRG@@YAXXZ"() #0 {
   %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
   %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
-  
-  ; FIXME: 3 more ops coming soon
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
   ret void
 }
@@ -165,9 +188,10 @@ define void @"\01?MainIS@@YAXXZ"() #0 {
   %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
   %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
-  
-  ; FIXME: 3 more ops coming soon
-
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
+ 
   ret void
 }
 
@@ -198,9 +222,10 @@ define void @"\01?MainCL@@YAXUAttribs@@@Z"(%struct.Attribs* noalias nocapture %a
   %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
   %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
-
   ret void
 }
 
@@ -231,9 +256,10 @@ define void @"\01?MainAH@@YAXURayPayload@@UAttribs@@@Z"(%struct.RayPayload* noal
   %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
   %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
-
   ret void
 }
 
@@ -264,9 +290,10 @@ define void @"\01?MainCH@@YAXURayPayload@@UAttribs@@@Z"(%struct.RayPayload* noal
   %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
   %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
-
   ret void
 }
 
@@ -297,9 +324,10 @@ define void @"\01?MainMS@@YAXURayPayload@@@Z"(%struct.RayPayload* noalias nocapt
   %v13 = call %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.mC4M5N4U2S2.mC4M5N4U0S2.mC4M4N5U1S2.mC4M5N4U2S2(i32 -2147483637, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, %dx.types.LinAlgMatrixC4M4N5U1S2 %v8, %dx.types.LinAlgMatrixC4M5N4U2S2 %v12)  ; LinAlgMatrixMultiplyAccumulate(matrixA,matrixB,matrixC)
   %v14 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32 -2147483629, %dx.types.LinAlgMatrixC4M5N4U0S2 %v9, i32 1, i32 1)  ; LinAlgMatrixSetElement(matrix,threadLocalIndex,value)
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
-
   ret void
 }
 
@@ -351,6 +379,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 
diff --git a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll
index 09a3753a6a..cacff8b532 100644
--- a/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll
+++ b/tools/clang/test/LitDXILValidation/LinAlgMatrix/linalgmatrix-vs.ll
@@ -11,6 +11,9 @@
 ; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixGetElement not valid in shader model vs_6_10.
 ; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixMultiplyAccumulate not valid in shader model vs_6_10.
 ; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixSetElement not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixStoreToMemory not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixAccumulateToMemory not valid in shader model vs_6_10.
+; CHECK: Function:  mainVS: error: Opcode LinAlgMatrixLoadFromMemory not valid in shader model vs_6_10.
 ; CHECK: Function:  mainVS: error: Entry function performs some operation that is incompatible with the shader stage or other entry properties.  See other errors for details.
 ; CHECK: Function:  mainVS: error: Function uses features incompatible with the shader stage (vs) of the entry function.
 ; CHECK: Validation failed.
@@ -27,6 +30,8 @@ target triple = "dxil-ms-dx"
 %dx.types.ResourceProperties = type { i32, i32 }
 %struct.RWByteAddressBuffer = type { i32 }
 
+@"\01?SharedArr@@3PAMA" = external addrspace(3) global [64 x float], align 4
+
 define void @mainVS() {
 
   %1 = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 0, i32 0, i32 0, i8 1 }, i32 0, i1 false)  ; CreateHandleFromBinding(bind,index,nonUniformIndex)
@@ -88,7 +93,14 @@ define void @mainVS() {
   ; dx.op.linAlgMatrixStoreToDescriptor
   call void @dx.op.linAlgMatrixStoreToDescriptor.mC4M5N4U0S2(i32 -2147483628, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, %dx.types.Handle %handle, i32 1, i32 2, i32 3)  ; LinAlgMatrixStoreToDescriptor(matrix,handle,offset,stride,layout)
   
-  ; FIXME: 3 more ops coming soon
+  ; dx.op.linAlgMatrixAccumulateToMemory
+  call void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32 -2147483620, %dx.types.LinAlgMatrixC4M5N4U0S2 %v14, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixAccumulateToMemory(matrix,memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixLoadFromMemory
+  %v15 = call %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32 -2147483633, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixLoadFromMemory(memory,offset,stride,layout)
+  
+  ; dx.op.linAlgMatrixStoreToMemory
+  call void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32 -2147483627, %dx.types.LinAlgMatrixC4M5N4U0S2 %v15, float addrspace(3)* getelementptr inbounds ([64 x float], [64 x float] addrspace(3)* @"\01?SharedArr@@3PAMA", i32 0, i32 0), i32 0, i32 0, i32 0)  ; LinAlgMatrixStoreToMemory(matrix,memory,offset,stride,layout)
 
   call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 0, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
   call void @dx.op.storeOutput.f32(i32 5, i32 0, i32 0, i8 1, float 1.000000e+00)  ; StoreOutput(outputSigId,rowIndex,colIndex,value)
@@ -146,6 +158,15 @@ declare %dx.types.LinAlgMatrixC4M5N4U2S2 @dx.op.linAlgMatrixMultiplyAccumulate.m
 ; Function Attrs: nounwind
 declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixSetElement.mC4M5N4U0S2.mC4M5N4U0S2.i32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, i32, i32) #0
 
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixStoreToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @dx.op.linAlgMatrixAccumulateToMemory.mC4M5N4U0S2.f32(i32, %dx.types.LinAlgMatrixC4M5N4U0S2, float addrspace(3)*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare %dx.types.LinAlgMatrixC4M5N4U0S2 @dx.op.linAlgMatrixLoadFromMemory.mC4M5N4U0S2.f32(i32, float addrspace(3)*, i32, i32, i32) #0
+
 ; Function Attrs: nounwind readnone
 declare %dx.types.Handle @dx.op.annotateHandle(i32, %dx.types.Handle, %dx.types.ResourceProperties) #1
 

From c64d0a347a01a4e9abc521db8964f8979a6271ef Mon Sep 17 00:00:00 2001
From: Helena Kotas <hekotas@microsoft.com>
Date: Tue, 17 Mar 2026 17:36:01 -0700
Subject: [PATCH 7/7] Fix merge (revert hctdb.py change)

---
 utils/hct/hctdb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 74515d6a4a..e88834aa62 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -9601,7 +9601,7 @@ def __init__(self, intrinsic_defs, opcode_data):
             "out": "AR_QUAL_OUT",
             "col_major": "AR_QUAL_COLMAJOR",
             "row_major": "AR_QUAL_ROWMAJOR",
-            "groupshared": "AR_QUAL_IN | AR_QUAL_GROUPSHARED",
+            "groupshared": "AR_QUAL_GROUPSHARED",
         }
         self.intrinsics = []
         self.load_intrinsics(intrinsic_defs)