diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp index 0b4c7218d4..7047d9fe59 100644 --- a/lib/DXIL/DxilOperations.cpp +++ b/lib/DXIL/DxilOperations.cpp @@ -96,16 +96,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = { "unary", Attribute::ReadNone, 1, - {{0x7}}, - {{0x0}}}, // Overloads: hfd + {{0x407}}, + {{0x7}}}, // Overloads: hfd refArgs, Type *Ty, Type *RetTy, OP *hlslOP, IRBuilder<> &Builder) { @@ -459,17 +467,42 @@ Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode, } } } -// Generates a DXIL operation over an overloaded type (Ty), returning a -// RetTy value; when Ty is a vector, it will replicate per-element operations -// into RetTy to rebuild it. -Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, - Type *Ty, Type *RetTy, OP *hlslOP, - IRBuilder<> &Builder) { + +// Creates a native vector call for a "trivial" operation where only a single +// call instruction is needed. The overload and return types are the same vector +// type `Ty`. +// Utility objects `OP` and `Builder` are used to create a call to the given +// `Func` with `Args` arguments. +Value *TrivialDxilVectorOperation(Function *Func, OP::OpCode Opcode, + ArrayRef Args, Type *Ty, OP *OP, + IRBuilder<> &Builder) { + if (!Ty->isVoidTy()) + return Builder.CreateCall(Func, Args, OP->GetOpCodeName(Opcode)); + else + return Builder.CreateCall(Func, Args); // Cannot add name to void. +} + +// Generates a DXIL operation with the overloaded type based on `Ty` and return +// type `RetTy`. When Ty is a vector, it will either generate per-element calls +// for each vector element and reconstruct the vector type from those results or +// operate on and return native vectors depending on vector size and the value +// of `SupportsVectors`, which is determined by version and opcode support. 
+Value *TrivialDxilOperation(OP::OpCode Opcode, ArrayRef Args, Type *Ty, + Type *RetTy, OP *OP, IRBuilder<> &Builder, + bool SupportsVectors = false) { + + // If supported and the overload type is a vector with more than 1 element, + // create a native vector operation. + if (SupportsVectors && Ty->isVectorTy() && Ty->getVectorNumElements() > 1) { + Function *Func = OP->GetOpFunc(Opcode, Ty); + return TrivialDxilVectorOperation(Func, Opcode, Args, Ty, OP, Builder); + } + + // Set overload type to the scalar type of `Ty` and generate call(s). Type *EltTy = Ty->getScalarType(); - Function *dxilFunc = hlslOP->GetOpFunc(opcode, EltTy); + Function *Func = OP->GetOpFunc(Opcode, EltTy); - return TrivialDxilOperation(dxilFunc, opcode, refArgs, Ty, RetTy, hlslOP, - Builder); + return TrivialDxilOperation(Func, Opcode, Args, Ty, RetTy, OP, Builder); } Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, @@ -484,82 +517,110 @@ Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, return TrivialDxilOperation(opcode, refArgs, Ty, Inst->getType(), hlslOP, B); } -Value *TrivialDxilUnaryOperationRet(OP::OpCode opcode, Value *src, Type *RetTy, - hlsl::OP *hlslOP, IRBuilder<> &Builder) { - Type *Ty = src->getType(); +// Translate call that converts to a dxil unary operation with a different +// return type from the overload by passing the argument, explicit return type, +// and helper objects to the scalarizing unary dxil operation creation. 
+Value *TrivialUnaryOperationRet(CallInst *CI, IntrinsicOp IOP, + OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + Value *Src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Type *Ty = Src->getType(); - Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, src}; + IRBuilder<> Builder(CI); + hlsl::OP *OP = &Helper.hlslOP; + Type *RetTy = CI->getType(); + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src}; - return TrivialDxilOperation(opcode, args, Ty, RetTy, hlslOP, Builder); + return TrivialDxilOperation(Opcode, Args, Ty, RetTy, OP, Builder); } -Value *TrivialDxilUnaryOperation(OP::OpCode opcode, Value *src, - hlsl::OP *hlslOP, IRBuilder<> &Builder) { - return TrivialDxilUnaryOperationRet(opcode, src, src->getType(), hlslOP, - Builder); +Value *TrivialDxilUnaryOperation(OP::OpCode Opcode, Value *Src, hlsl::OP *OP, + IRBuilder<> &Builder, + bool SupportsVectors = false) { + Type *Ty = Src->getType(); + + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src}; + + return TrivialDxilOperation(Opcode, Args, Ty, Ty, OP, Builder, + SupportsVectors); } -Value *TrivialDxilBinaryOperation(OP::OpCode opcode, Value *src0, Value *src1, - hlsl::OP *hlslOP, IRBuilder<> &Builder) { - Type *Ty = src0->getType(); +Value *TrivialDxilBinaryOperation(OP::OpCode Opcode, Value *Src0, Value *Src1, + hlsl::OP *OP, IRBuilder<> &Builder, + bool SupportsVectors = false) { + Type *Ty = Src0->getType(); - Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, src0, src1}; + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src0, Src1}; - return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder); + return TrivialDxilOperation(Opcode, Args, Ty, Ty, OP, Builder, + SupportsVectors); } -Value *TrivialDxilTrinaryOperation(OP::OpCode opcode, Value *src0, Value *src1, 
- Value *src2, hlsl::OP *hlslOP, - IRBuilder<> &Builder) { - Type *Ty = src0->getType(); +Value *TrivialDxilTrinaryOperation(OP::OpCode Opcode, Value *Src0, Value *Src1, + Value *Src2, hlsl::OP *OP, + IRBuilder<> &Builder, + bool SupportsVectors = false) { + Type *Ty = Src0->getType(); - Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, src0, src1, src2}; + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src0, Src1, Src2}; - return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder); + return TrivialDxilOperation(Opcode, Args, Ty, Ty, OP, Builder, + SupportsVectors); } -Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +// Translate call that trivially converts to a dxil unary operation by passing +// argument, return type, and helper objects to either scalarizing or native +// vector dxil operation creation depending on version and vector size. 
+Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - Value *src0 = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Value *Src0 = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); IRBuilder<> Builder(CI); - hlsl::OP *hlslOP = &helper.hlslOP; - Value *retVal = TrivialDxilUnaryOperationRet(opcode, src0, CI->getType(), - hlslOP, Builder); - return retVal; + hlsl::OP *OP = &Helper.hlslOP; + + return TrivialDxilUnaryOperation(Opcode, Src0, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } -Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +// Translate call that trivially converts to a dxil binary operation by passing +// arguments, return type, and helper objects to either scalarizing or native +// vector dxil operation creation depending on version and vector size. 
+Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *src0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); - Value *src1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); + hlsl::OP *OP = &Helper.hlslOP; + Value *Src0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); + Value *Src1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); IRBuilder<> Builder(CI); - Value *binOp = - TrivialDxilBinaryOperation(opcode, src0, src1, hlslOP, Builder); - return binOp; + return TrivialDxilBinaryOperation(Opcode, Src0, Src1, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } -Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +// Translate call that trivially converts to a dxil trinary (aka tertiary) +// operation by passing arguments, return type, and helper objects to either +// scalarizing or native vector dxil operation creation depending on version +// and vector size. 
+Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *src0 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx); - Value *src1 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx); - Value *src2 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); + hlsl::OP *OP = &Helper.hlslOP; + Value *Src0 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx); + Value *Src1 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx); + Value *Src2 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); IRBuilder<> Builder(CI); - Value *triOp = - TrivialDxilTrinaryOperation(opcode, src0, src1, src2, hlslOP, Builder); - return triOp; + return TrivialDxilTrinaryOperation(Opcode, Src0, Src1, Src2, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } Value *TrivialIsSpecialFloat(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -724,48 +785,54 @@ Value *TranslateD3DColorToUByte4(CallInst *CI, IntrinsicOp IOP, // | float | False | 2 | // +----------+---------------------+------------------+ -bool CanUseFxcMulOnlyPatternForPow(IRBuilder<> &Builder, Value *x, Value *pow, - int32_t &powI) { +bool CanUseFxcMulOnlyPatternForPow(IRBuilder<> &Builder, Value *X, Value *Pow, + int32_t &PowI) { // Applicable only when power is a literal. - if (!isa(pow) && !isa(pow)) { + if (!isa(Pow) && !isa(Pow)) { return false; } // Only apply this code gen on splat values. - if (ConstantDataVector *cdv = dyn_cast(pow)) { - if (!hlsl::dxilutil::IsSplat(cdv)) { + if (ConstantDataVector *Cdv = dyn_cast(Pow)) { + if (!hlsl::dxilutil::IsSplat(Cdv)) { return false; } } - APFloat powAPF = isa(pow) - ? cast(pow)->getElementAsAPFloat(0) + // Only apply on aggregates of 16 or fewer elements, + // representing the max 4x4 matrix size. 
+ Type *Ty = X->getType(); + if (Ty->isVectorTy() && Ty->getVectorNumElements() > 16) + return false; + + APFloat PowAPF = isa(Pow) + ? cast(Pow)->getElementAsAPFloat(0) : // should be a splat value - cast(pow)->getValueAPF(); - APSInt powAPS(32, false); - bool isExact = false; + cast(Pow)->getValueAPF(); + APSInt PowAPS(32, false); + bool IsExact = false; // Try converting float value of power to integer and also check if the float // value is exact. - APFloat::opStatus status = - powAPF.convertToInteger(powAPS, APFloat::rmTowardZero, &isExact); - if (status == APFloat::opStatus::opOK && isExact) { - powI = powAPS.getExtValue(); - uint32_t powU = abs(powI); - int setBitCount = 0; - int maxBitSetPos = -1; - for (int i = 0; i < 32; i++) { - if ((powU >> i) & 1) { - setBitCount++; - maxBitSetPos = i; + APFloat::opStatus Status = + PowAPF.convertToInteger(PowAPS, APFloat::rmTowardZero, &IsExact); + if (Status == APFloat::opStatus::opOK && IsExact) { + PowI = PowAPS.getExtValue(); + uint32_t PowU = abs(PowI); + int SetBitCount = 0; + int MaxBitSetPos = -1; + for (int I = 0; I < 32; I++) { + if ((PowU >> I) & 1) { + SetBitCount++; + MaxBitSetPos = I; } } - DXASSERT(maxBitSetPos <= 30, "msb should always be zero."); - unsigned numElem = - isa(pow) ? x->getType()->getVectorNumElements() : 1; - int mulOpThreshold = powI < 0 ? numElem + 1 : 2 * numElem + 1; - int mulOpNeeded = maxBitSetPos + setBitCount - 1; - return mulOpNeeded <= mulOpThreshold; + DXASSERT(MaxBitSetPos <= 30, "msb should always be zero."); + unsigned NumElem = + isa(Pow) ? X->getType()->getVectorNumElements() : 1; + int MulOpThreshold = PowI < 0 ? 
NumElem + 1 : 2 * NumElem + 1; + int MulOpNeeded = MaxBitSetPos + SetBitCount - 1; + return MulOpNeeded <= MulOpThreshold; } return false; @@ -1447,6 +1514,7 @@ Value *TranslateWaveA2B(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *refArgs[] = {nullptr, CI->getOperand(1)}; return TrivialDxilOperation(opcode, refArgs, helper.voidTy, CI, hlslOP); } + // Wave ballot intrinsic. Value *TranslateWaveBallot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLOperationLowerHelper &helper, @@ -1899,9 +1967,11 @@ Value *TranslateClamp(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, IRBuilder<> Builder(CI); // min(max(x, minVal), maxVal). - Value *maxXMinVal = - TrivialDxilBinaryOperation(maxOp, x, minVal, hlslOP, Builder); - return TrivialDxilBinaryOperation(minOp, maxXMinVal, maxVal, hlslOP, Builder); + bool SupportsVectors = helper.M.GetShaderModel()->IsSM69Plus(); + Value *maxXMinVal = TrivialDxilBinaryOperation(maxOp, x, minVal, hlslOP, + Builder, SupportsVectors); + return TrivialDxilBinaryOperation(minOp, maxXMinVal, maxVal, hlslOP, Builder, + SupportsVectors); } Value *TranslateClip(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2014,46 +2084,45 @@ Value *TranslateDst(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return Result; } -Value *TranslateFirstbitHi(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateFirstbitHi(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - Value *firstbitHi = - TrivialUnaryOperation(CI, IOP, opcode, helper, pObjHelper, Translated); + Value *FirstbitHi = + TrivialUnaryOperationRet(CI, IOP, Opcode, Helper, ObjHelper, Translated); // firstbitHi == -1? 
-1 : (bitWidth-1 -firstbitHi); IRBuilder<> Builder(CI); - Constant *neg1 = Builder.getInt32(-1); - Value *src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Neg1 = Builder.getInt32(-1); + Value *Src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Type *Ty = src->getType(); + Type *Ty = Src->getType(); IntegerType *EltTy = cast(Ty->getScalarType()); - Constant *bitWidth = Builder.getInt32(EltTy->getBitWidth() - 1); + Constant *BitWidth = Builder.getInt32(EltTy->getBitWidth() - 1); if (Ty == Ty->getScalarType()) { - Value *sub = Builder.CreateSub(bitWidth, firstbitHi); - Value *cond = Builder.CreateICmpEQ(neg1, firstbitHi); - return Builder.CreateSelect(cond, neg1, sub); + Value *Sub = Builder.CreateSub(BitWidth, FirstbitHi); + Value *Cond = Builder.CreateICmpEQ(Neg1, FirstbitHi); + return Builder.CreateSelect(Cond, Neg1, Sub); } else { - Value *result = UndefValue::get(CI->getType()); - unsigned vecSize = Ty->getVectorNumElements(); - for (unsigned i = 0; i < vecSize; i++) { - Value *EltFirstBit = Builder.CreateExtractElement(firstbitHi, i); - Value *sub = Builder.CreateSub(bitWidth, EltFirstBit); - Value *cond = Builder.CreateICmpEQ(neg1, EltFirstBit); - Value *Elt = Builder.CreateSelect(cond, neg1, sub); - result = Builder.CreateInsertElement(result, Elt, i); + Value *Result = UndefValue::get(CI->getType()); + unsigned VecSize = Ty->getVectorNumElements(); + for (unsigned I = 0; I < VecSize; I++) { + Value *EltFirstBit = Builder.CreateExtractElement(FirstbitHi, I); + Value *Sub = Builder.CreateSub(BitWidth, EltFirstBit); + Value *Cond = Builder.CreateICmpEQ(Neg1, EltFirstBit); + Value *Elt = Builder.CreateSelect(Cond, Neg1, Sub); + Result = Builder.CreateInsertElement(Result, Elt, I); } - return result; + return Result; } } -Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, 
OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - Value *firstbitLo = - TrivialUnaryOperation(CI, IOP, opcode, helper, pObjHelper, Translated); - return firstbitLo; + return TrivialUnaryOperationRet(CI, IOP, Opcode, Helper, ObjHelper, + Translated); } Value *TranslateLit(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2200,57 +2269,60 @@ Value *TranslateDistance(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return TranslateLength(CI, sub, hlslOP); } -Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, - bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; +Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { + hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); - Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *log2eConst = ConstantFP::get(Ty->getScalarType(), M_LOG2E); - if (Ty != Ty->getScalarType()) { - log2eConst = - ConstantVector::getSplat(Ty->getVectorNumElements(), log2eConst); - } - val = Builder.CreateFMul(log2eConst, val); - Value *exp = TrivialDxilUnaryOperation(OP::OpCode::Exp, val, hlslOP, Builder); - return exp; + Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Log2eConst = ConstantFP::get(Ty->getScalarType(), M_LOG2E); + if (Ty != Ty->getScalarType()) + Log2eConst = + ConstantVector::getSplat(Ty->getVectorNumElements(), Log2eConst); + Val = Builder.CreateFMul(Log2eConst, Val); + + return TrivialDxilUnaryOperation(OP::OpCode::Exp, Val, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } -Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, - bool &Translated) { - hlsl::OP 
*hlslOP = &helper.hlslOP; +Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { + hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); - Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *ln2Const = ConstantFP::get(Ty->getScalarType(), M_LN2); - if (Ty != Ty->getScalarType()) { - ln2Const = ConstantVector::getSplat(Ty->getVectorNumElements(), ln2Const); - } - Value *log = TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder); + Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Ln2Const = ConstantFP::get(Ty->getScalarType(), M_LN2); + if (Ty != Ty->getScalarType()) + Ln2Const = ConstantVector::getSplat(Ty->getVectorNumElements(), Ln2Const); + + Value *log = + TrivialDxilUnaryOperation(OP::OpCode::Log, Val, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); - return Builder.CreateFMul(ln2Const, log); + return Builder.CreateFMul(Ln2Const, log); } -Value *TranslateLog10(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateLog10(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); - Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *log2_10Const = ConstantFP::get(Ty->getScalarType(), M_LN2 / M_LN10); + Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Log2to10Const = + ConstantFP::get(Ty->getScalarType(), M_LN2 / M_LN10); if (Ty != Ty->getScalarType()) { - log2_10Const = - ConstantVector::getSplat(Ty->getVectorNumElements(), log2_10Const); + Log2to10Const = + 
ConstantVector::getSplat(Ty->getVectorNumElements(), Log2to10Const); } - Value *log = TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder); + Value *Log = + TrivialDxilUnaryOperation(OP::OpCode::Log, Val, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); - return Builder.CreateFMul(log2_10Const, log); + return Builder.CreateFMul(Log2to10Const, Log); } Value *TranslateFMod(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2431,18 +2503,23 @@ Value *TrivialDotOperation(OP::OpCode opcode, Value *src0, Value *src1, return dotOP; } -Value *TranslateIDot(Value *arg0, Value *arg1, unsigned vecSize, - hlsl::OP *hlslOP, IRBuilder<> &Builder, - bool Unsigned = false) { - auto madOpCode = Unsigned ? DXIL::OpCode::UMad : DXIL::OpCode::IMad; - Value *Elt0 = Builder.CreateExtractElement(arg0, (uint64_t)0); - Value *Elt1 = Builder.CreateExtractElement(arg1, (uint64_t)0); - Value *Result = Builder.CreateMul(Elt0, Elt1); - for (unsigned iVecElt = 1; iVecElt < vecSize; ++iVecElt) { - Elt0 = Builder.CreateExtractElement(arg0, iVecElt); - Elt1 = Builder.CreateExtractElement(arg1, iVecElt); - Result = TrivialDxilTrinaryOperation(madOpCode, Elt0, Elt1, Result, hlslOP, - Builder); +// Instead of using a DXIL intrinsic, implement a dot product operation using +// multiply and add operations. Used for integer dots and long vectors. 
+Value *ExpandDot(Value *Arg0, Value *Arg1, unsigned VecSize, hlsl::OP *OP, + IRBuilder<> &Builder, + DXIL::OpCode MadOpCode = DXIL::OpCode::IMad) { + Value *Elt0 = Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *Elt1 = Builder.CreateExtractElement(Arg1, (uint64_t)0); + Value *Result; + if (Elt0->getType()->isFloatingPointTy()) + Result = Builder.CreateFMul(Elt0, Elt1); + else + Result = Builder.CreateMul(Elt0, Elt1); + for (unsigned Elt = 1; Elt < VecSize; ++Elt) { + Elt0 = Builder.CreateExtractElement(Arg0, Elt); + Elt1 = Builder.CreateExtractElement(Arg1, Elt); + Result = + TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, OP, Builder); } return Result; @@ -2470,21 +2547,25 @@ Value *TranslateFDot(Value *arg0, Value *arg1, unsigned vecSize, } } -Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, - bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); - Type *Ty = arg0->getType(); - unsigned vecSize = Ty->getVectorNumElements(); - Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); +Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { + hlsl::OP *OP = &Helper.hlslOP; + Value *Arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); + Type *Ty = Arg0->getType(); + unsigned VecSize = Ty->getVectorNumElements(); + Value *Arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); IRBuilder<> Builder(CI); - if (Ty->getScalarType()->isFloatingPointTy()) { - return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder); + Type *EltTy = Ty->getScalarType(); + if (EltTy->isFloatingPointTy() && Ty->getVectorNumElements() <= 4) { + return TranslateFDot(Arg0, Arg1, VecSize, OP, Builder); } else { - return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder, - IOP 
== IntrinsicOp::IOP_udot); + DXIL::OpCode MadOpCode = DXIL::OpCode::IMad; + if (IOP == IntrinsicOp::IOP_udot) + MadOpCode = DXIL::OpCode::UMad; + else if (EltTy->isFloatingPointTy()) + MadOpCode = DXIL::OpCode::FMad; + return ExpandDot(Arg0, Arg1, VecSize, OP, Builder, MadOpCode); } } @@ -2587,31 +2668,32 @@ Value *TranslateRefract(CallInst *CI, IntrinsicOp IOP, OP::OpCode op, return refract; } -Value *TranslateSmoothStep(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateSmoothStep(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; // s = saturate((x-min)/(max-min)). IRBuilder<> Builder(CI); - Value *minVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMinIdx); - Value *maxVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMaxIdx); - Value *maxSubMin = Builder.CreateFSub(maxVal, minVal); - Value *x = CI->getArgOperand(HLOperandIndex::kSmoothStepOpXIdx); - Value *xSubMin = Builder.CreateFSub(x, minVal); - Value *satVal = Builder.CreateFDiv(xSubMin, maxSubMin); - - Value *s = TrivialDxilUnaryOperation(DXIL::OpCode::Saturate, satVal, hlslOP, - Builder); + Value *MinVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMinIdx); + Value *MaxVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMaxIdx); + Value *MaxSubMin = Builder.CreateFSub(MaxVal, MinVal); + Value *X = CI->getArgOperand(HLOperandIndex::kSmoothStepOpXIdx); + Value *XSubMin = Builder.CreateFSub(X, MinVal); + Value *SatVal = Builder.CreateFDiv(XSubMin, MaxSubMin); + + Value *S = + TrivialDxilUnaryOperation(DXIL::OpCode::Saturate, SatVal, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); // return s * s *(3-2*s). 
- Constant *c2 = ConstantFP::get(CI->getType(), 2); - Constant *c3 = ConstantFP::get(CI->getType(), 3); + Constant *C2 = ConstantFP::get(CI->getType(), 2); + Constant *C3 = ConstantFP::get(CI->getType(), 3); - Value *sMul2 = Builder.CreateFMul(s, c2); - Value *result = Builder.CreateFSub(c3, sMul2); - result = Builder.CreateFMul(s, result); - result = Builder.CreateFMul(s, result); - return result; + Value *SMul2 = Builder.CreateFMul(S, C2); + Value *Result = Builder.CreateFSub(C3, SMul2); + Result = Builder.CreateFMul(S, Result); + Result = Builder.CreateFMul(S, Result); + return Result; } Value *TranslateMSad4(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -3013,45 +3095,46 @@ Value *SplatToVector(Value *Elt, Type *DstTy, IRBuilder<> &Builder) { return Result; } -Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, - bool &Translated) { +Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); - Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); - Type *arg0Ty = arg0->getType(); - Type *arg1Ty = arg1->getType(); + hlsl::OP *OP = &Helper.hlslOP; + Value *Arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); + Value *Arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); + Type *Arg0Ty = Arg0->getType(); + Type *Arg1Ty = Arg1->getType(); IRBuilder<> Builder(CI); - if (arg0Ty->isVectorTy()) { - if (arg1Ty->isVectorTy()) { + if (Arg0Ty->isVectorTy()) { + if (Arg1Ty->isVectorTy()) { // mul(vector, vector) == dot(vector, vector) - unsigned vecSize = arg0Ty->getVectorNumElements(); - if (arg0Ty->getScalarType()->isFloatingPointTy()) { - return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder); + unsigned VecSize = 
Arg0Ty->getVectorNumElements(); + if (Arg0Ty->getScalarType()->isFloatingPointTy()) { + return TranslateFDot(Arg0, Arg1, VecSize, OP, Builder); } else { - return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder, - IOP == IntrinsicOp::IOP_umul); + DXIL::OpCode MadOpCode = DXIL::OpCode::IMad; + if (IOP == IntrinsicOp::IOP_umul) + MadOpCode = DXIL::OpCode::UMad; + return ExpandDot(Arg0, Arg1, VecSize, OP, Builder, MadOpCode); } } else { // mul(vector, scalar) == vector * scalar-splat - arg1 = SplatToVector(arg1, arg0Ty, Builder); + Arg1 = SplatToVector(Arg1, Arg0Ty, Builder); } } else { - if (arg1Ty->isVectorTy()) { + if (Arg1Ty->isVectorTy()) { // mul(scalar, vector) == scalar-splat * vector - arg0 = SplatToVector(arg0, arg1Ty, Builder); + Arg0 = SplatToVector(Arg0, Arg1Ty, Builder); } // else mul(scalar, scalar) == scalar * scalar; } // create fmul/mul for the pair of vectors or scalars - if (arg0Ty->getScalarType()->isFloatingPointTy()) { - return Builder.CreateFMul(arg0, arg1); + if (Arg0Ty->getScalarType()->isFloatingPointTy()) { + return Builder.CreateFMul(Arg0, Arg1); } else { - return Builder.CreateMul(arg0, arg1); + return Builder.CreateMul(Arg0, Arg1); } } @@ -6150,20 +6233,8 @@ Value *TranslateAnd(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, bool &Translated) { Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); - Type *Ty = CI->getType(); - Type *EltTy = Ty->getScalarType(); IRBuilder<> Builder(CI); - if (Ty != EltTy) { - Value *Result = UndefValue::get(Ty); - for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) { - Value *EltX = Builder.CreateExtractElement(x, i); - Value *EltY = Builder.CreateExtractElement(y, i); - Value *tmp = Builder.CreateAnd(EltX, EltY); - Result = Builder.CreateInsertElement(Result, tmp, i); - } - return Result; - } return Builder.CreateAnd(x, y); } Value *TranslateOr(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -6171,20 +6242,8 @@ Value 
*TranslateOr(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); - Type *Ty = CI->getType(); - Type *EltTy = Ty->getScalarType(); IRBuilder<> Builder(CI); - if (Ty != EltTy) { - Value *Result = UndefValue::get(Ty); - for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) { - Value *EltX = Builder.CreateExtractElement(x, i); - Value *EltY = Builder.CreateExtractElement(y, i); - Value *tmp = Builder.CreateOr(EltX, EltY); - Result = Builder.CreateInsertElement(Result, tmp, i); - } - return Result; - } return Builder.CreateOr(x, y); } Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -6194,21 +6253,8 @@ Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *cond = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx); Value *t = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx); Value *f = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); - Type *Ty = CI->getType(); - Type *EltTy = Ty->getScalarType(); IRBuilder<> Builder(CI); - if (Ty != EltTy) { - Value *Result = UndefValue::get(Ty); - for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) { - Value *EltCond = Builder.CreateExtractElement(cond, i); - Value *EltTrue = Builder.CreateExtractElement(t, i); - Value *EltFalse = Builder.CreateExtractElement(f, i); - Value *tmp = Builder.CreateSelect(EltCond, EltTrue, EltFalse); - Result = Builder.CreateInsertElement(Result, tmp, i); - } - return Result; - } return Builder.CreateSelect(cond, t, f); } } // namespace @@ -6467,18 +6513,20 @@ IntrinsicLower gLowerTable[] = { {IntrinsicOp::IOP_clip, TranslateClip, DXIL::OpCode::NumOpCodes}, {IntrinsicOp::IOP_cos, TrivialUnaryOperation, DXIL::OpCode::Cos}, {IntrinsicOp::IOP_cosh, TrivialUnaryOperation, DXIL::OpCode::Hcos}, - {IntrinsicOp::IOP_countbits, TrivialUnaryOperation, + 
{IntrinsicOp::IOP_countbits, TrivialUnaryOperationRet, DXIL::OpCode::Countbits}, {IntrinsicOp::IOP_cross, TranslateCross, DXIL::OpCode::NumOpCodes}, - {IntrinsicOp::IOP_ddx, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseX}, - {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddx, TrivialUnaryOperationRet, + DXIL::OpCode::DerivCoarseX}, + {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperationRet, DXIL::OpCode::DerivCoarseX}, - {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperationRet, DXIL::OpCode::DerivFineX}, - {IntrinsicOp::IOP_ddy, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseY}, - {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddy, TrivialUnaryOperationRet, + DXIL::OpCode::DerivCoarseY}, + {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperationRet, DXIL::OpCode::DerivCoarseY}, - {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperationRet, DXIL::OpCode::DerivFineY}, {IntrinsicOp::IOP_degrees, TranslateDegrees, DXIL::OpCode::NumOpCodes}, {IntrinsicOp::IOP_determinant, EmptyLower, DXIL::OpCode::NumOpCodes}, diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp index 027d7d3cbc..3dac550218 100644 --- a/tools/clang/lib/Sema/SemaHLSL.cpp +++ b/tools/clang/lib/Sema/SemaHLSL.cpp @@ -6606,7 +6606,7 @@ bool HLSLExternalSource::MatchArguments( argTypes.clear(); const bool isVariadic = IsVariadicIntrinsicFunction(pIntrinsic); - static const UINT UnusedSize = 0xFF; + static const UINT UnusedSize = UINT_MAX; static const BYTE MaxIntrinsicArgs = g_MaxIntrinsicParamCount + 1; #define CAB(cond, arg) \ { \ @@ -6622,7 +6622,7 @@ bool HLSLExternalSource::MatchArguments( ArBasicKind ComponentType[MaxIntrinsicArgs]; // Component type for each argument, // AR_BASIC_UNKNOWN if unspecified. 
- UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UNUSED_INDEX32 + UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UnusedSize // if unspecified. badArgIdx = MaxIntrinsicArgs; diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl new file mode 100644 index 0000000000..af6f96745c --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl @@ -0,0 +1,391 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=125 %s | FileCheck %s + +// Test vector-enabled non-trivial intrinsics that take parameters of various types. + +RWByteAddressBuffer buf; +RWByteAddressBuffer ibuf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> +// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double> + +[numthreads(8,1,1)] +void main() { + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle {{%.*}}, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle 
[[buf]], i32 1024 + // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec1 = buf.Load >(0); + vector hVec2 = buf.Load >(512); + vector hVec3 = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec1 = buf.Load >(2048); + vector fVec2 = buf.Load >(2560); + vector fVec3 = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + vector dVec1 = buf.Load >(4096); + vector dVec2 = buf.Load >(4608); + vector dVec3 = buf.Load >(5120); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle {{%.*}}, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec1:%.*]] = extractvalue 
%dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[svec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec1 = ibuf.Load >(0); + vector sVec2 = ibuf.Load >(512); + vector sVec3 = ibuf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1025 + // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536 + // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[usvec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec1 = ibuf.Load >(1025); + vector usVec2 = ibuf.Load >(1536); + vector usVec3 = ibuf.Load >(2048); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2049 + // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[ivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec1 = ibuf.Load >(2049); + vector iVec2 = ibuf.Load 
>(2560); + vector iVec3 = ibuf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3073 + // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584 + // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[uivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec1 = ibuf.Load >(3073); + vector uiVec2 = ibuf.Load >(3584); + vector uiVec3 = ibuf.Load >(4096); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4097 + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[lvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec1 = ibuf.Load >(4097); + vector lVec2 = ibuf.Load >(4608); + vector lVec3 = ibuf.Load >(5120); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5121 + // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632 + // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call 
%dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 6144 + // CHECK: [[ulvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec1 = ibuf.Load >(5121); + vector ulVec2 = ibuf.Load >(5632); + vector ulVec3 = ibuf.Load >(6144); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 35, <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]]) ; FMax(a,b) + // CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 36, <[[NUM]] x half> [[tmp]], <[[NUM]] x half> [[hvec3]]) ; FMin(a,b) + vector hRes = clamp(hVec1, hVec2, hVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 35, <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]]) ; FMax(a,b) + // CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 36, <[[NUM]] x float> [[tmp]], <[[NUM]] x float> [[fvec3]]) ; FMin(a,b) + vector fRes = clamp(fVec1, fVec2, fVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 35, <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]]) ; FMax(a,b) + // CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 36, <[[NUM]] x double> [[tmp]], <[[NUM]] x double> [[dvec3]]) ; FMin(a,b) + vector dRes = clamp(dVec1, dVec2, dVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 37, <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]]) ; IMax(a,b) + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 38, <[[NUM]] x i16> [[tmp]], <[[NUM]] x i16> [[svec3]]) ; IMin(a,b) + vector sRes = clamp(sVec1, sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 39, <[[NUM]] x i16> 
[[usvec1]], <[[NUM]] x i16> [[usvec2]]) ; UMax(a,b) + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 40, <[[NUM]] x i16> [[tmp]], <[[NUM]] x i16> [[usvec3]]) ; UMin(a,b) + vector usRes = clamp(usVec1, usVec2, usVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 37, <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]]) ; IMax(a,b) + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 38, <[[NUM]] x i32> [[tmp]], <[[NUM]] x i32> [[ivec3]]) ; IMin(a,b) + vector iRes = clamp(iVec1, iVec2, iVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 39, <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]]) ; UMax(a,b) + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 40, <[[NUM]] x i32> [[tmp]], <[[NUM]] x i32> [[uivec3]]) ; UMin(a,b) + vector uiRes = clamp(uiVec1, uiVec2, uiVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 37, <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]]) ; IMax(a,b) + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 38, <[[NUM]] x i64> [[tmp]], <[[NUM]] x i64> [[lvec3]]) ; IMin(a,b) + vector lRes = clamp(lVec1, lVec2, lVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 39, <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]]) ; UMax(a,b) + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 40, <[[NUM]] x i64> [[tmp]], <[[NUM]] x i64> [[ulvec3]]) ; UMin(a,b) + vector ulRes = clamp(ulVec1, ulVec2, ulVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fcmp fast olt <[[NUM]] x half> [[hvec2]], [[hvec1]] + // CHECK: select <[[NUM]] x i1> [[tmp]], <[[NUM]] x half> zeroinitializer, <[[NUM]] x half> [[fvec2]], 
[[fvec1]] + // CHECK: select <[[NUM]] x i1> [[tmp]], <[[NUM]] x float> zeroinitializer, <[[NUM]] x float> [[hvec1]], @dx.op.unary.[[HTY]](i32 21, <[[NUM]] x half> [[tmp]]) ; Exp(value) + hRes += exp(hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fmul fast <[[NUM]] x float> [[fvec1]], @dx.op.unary.[[FTY]](i32 21, <[[NUM]] x float> [[tmp]]) ; Exp(value) + fRes += exp(fVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 23, <[[NUM]] x half> [[hvec1]]) ; Log(value) + // CHECK: fmul fast <[[NUM]] x half> [[tmp]], @dx.op.unary.[[FTY]](i32 23, <[[NUM]] x float> [[fvec1]]) ; Log(value) + // CHECK: fmul fast <[[NUM]] x float> [[tmp]], [[hvec2]], [[hvec1]] + // CHECK: [[xsub:%.*]] = fsub fast <[[NUM]] x half> [[hvec3]], [[hvec1]] + // CHECK: [[div:%.*]] = fdiv fast <[[NUM]] x half> [[xsub]], [[sub]] + // CHECK: [[sat:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 7, <[[NUM]] x half> [[div]]) ; Saturate(value) + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x half> [[sat]], , [[mul]] + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x half> [[sat]], [[sat]] + // CHECK: fmul fast <[[NUM]] x half> [[mul]], [[sub]] + hRes += smoothstep(hVec1, hVec2, hVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x float> [[fvec2]], [[fvec1]] + // CHECK: [[xsub:%.*]] = fsub fast <[[NUM]] x float> [[fvec3]], [[fvec1]] + // CHECK: [[div:%.*]] = fdiv fast <[[NUM]] x float> [[xsub]], [[sub]] + // CHECK: [[sat:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 7, <[[NUM]] x float> [[div]]) ; Saturate(value) + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x float> [[sat]], , [[mul]] + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x float> [[sat]], [[sat]] + // CHECK: fmul fast <[[NUM]] x float> [[mul]], [[sub]] + fRes += smoothstep(fVec1, fVec2, fVec3); + + // Intrinsics that expand into llvm ops. 
+ + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: fmul fast <[[NUM]] x half> [[hvec2]], [[fvec2]], [[hvec3]], [[fvec3]], [[fvec1]], zeroinitializer + // CHECK: [[f2i:%.*]] = bitcast <[[NUM]] x float> [[fvec1]] to <[[NUM]] x i32> + // CHECK: [[and:%.*]] = and <[[NUM]] x i32> [[f2i]], [[and]], [[add]], [[shr]] to <[[NUM]] x float> + // CHECK: [[sel:%.*]] = select <[[NUM]] x i1> [[cmp]], <[[NUM]] x float> [[i2f]], <[[NUM]] x float> zeroinitializer + // CHECK: [[and:%.*]] = and <[[NUM]] x i32> [[f2i]], [[and]], exp = fVec3; + fRes += frexp(fVec1, exp); + fRes += exp; + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fsub fast <[[NUM]] x half> [[hvec3]], [[hvec2]] + // CHECK: fmul fast <[[NUM]] x half> [[tmp]], [[hvec1]] + hRes += lerp(hVec2, hVec3, hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fsub fast <[[NUM]] x float> [[fvec3]], [[fvec2]] + // CHECK: fmul fast <[[NUM]] x float> [[tmp]], [[fvec1]] + fRes += lerp(fVec2, fVec3, fVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: fdiv fast <[[NUM]] x half> , [[hvec1]] + hRes += rcp(hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: fdiv fast <[[NUM]] x float> , [[fvec1]] + fRes += rcp(fVec1); + + vector signs = 1; + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x half> [[hvec1]], zeroinitializer + // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x half> [[hvec1]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x float> [[fvec1]], zeroinitializer + // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] 
x float> [[fvec1]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(fVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x double> [[dvec1]], zeroinitializer + // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x double> [[dvec1]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(dVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i16> [[svec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i16> [[svec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(sVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i16> [[usvec2]], zeroinitializer + // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32> + signs *= sign(usVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i32> [[ivec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i32> [[ivec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: [[sub:%.*]] = sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(iVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i32> [[uivec2]], zeroinitializer + // CHECK: zext <[[NUM]] x i1> 
[[cmp]] to <[[NUM]] x i32> + signs *= sign(uiVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i64> [[lvec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i64> [[lvec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(lVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i64> [[ulvec2]], zeroinitializer + // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32> + signs *= sign(ulVec2); + + iRes += signs; + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[bvec2:%.*]] = icmp ne <[[NUM]] x i16> [[svec2]], zeroinitializer + // CHECK: [[bvec1:%.*]] = icmp ne <[[NUM]] x i16> [[svec1]], zeroinitializer + // CHECK: or <[[NUM]] x i1> [[bvec2]], [[bvec1]] + sRes += or(sVec1, sVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[bvec3:%.*]] = icmp ne <[[NUM]] x i16> [[svec3]], zeroinitializer + // CHECK: and <[[NUM]] x i1> [[bvec3]], [[bvec2]] + sRes += and(sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: select <[[NUM]] x i1> [[bvec1]], <[[NUM]] x i16> [[svec2]], <[[NUM]] x i16> [[svec3]] + sRes += select(sVec1, sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(2048, fRes); + buf.Store >(4096, dRes); + + ibuf.Store >(0, sRes); + ibuf.Store >(1024, usRes); + ibuf.Store >(2048, iRes); + ibuf.Store >(3072, uiRes); + ibuf.Store >(4096, lRes); + ibuf.Store >(5120, ulRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl new file mode 100644 index 0000000000..7d5da99e21 --- /dev/null 
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl @@ -0,0 +1,147 @@ +// RUN: %dxc -T ps_6_9 %s | FileCheck %s + +// Long vector tests for vec ops that scalarize to something more complex +// than a simple repetition of the same dx.op calls. + +StructuredBuffer< vector > buf; +ByteAddressBuffer rbuf; + +float4 main(uint i : SV_PrimitiveID, bool b : B) : SV_Target { + vector vec1 = rbuf.Load< vector >(i++*32); + vector vec2 = rbuf.Load< vector >(i++*32); + vector vec3 = rbuf.Load< vector >(i++*32); + + // CHECK: fdiv fast <8 x float> + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: fadd fast <8 x float> %{{.*}}, %{{.*}}, + // CHECK: fcmp fast oeq <8 x float> + // CHECK: fcmp fast oge <8 x float> + // CHECK: fcmp fast olt <8 x float> + // CHECK: and <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> + // CHECK: and <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> + // CHECK: and <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> + // CHECK: fsub fast <8 x float> + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; 
FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + + // CHECK: fsub fast <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> + // CHECK: fmul fast <8 x float> + vec1 = fmod(vec1, vec2); + + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: fmul fast <8 x float> + vec1 = ldexp(vec1, vec2); + + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float 
@dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: fmul fast <8 x float> + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + vec1 = pow(vec1, vec2); + + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: fsub fast <8 x float> + vec1 = modf(vec1, vec2); + + // CHECK: [[el:%.*]] = extractelement <8 x float> + // CHECK: [[mul:%.*]] = fmul fast float [[el]] + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mul]]) ; FMad(a,b,c) + // CHECK: [[pong:%.*]] = call float 
@dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c) + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c) + // CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c) + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c) + // CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c) + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c) + vec1 = dot(vec1, vec2); + + vector bvec = b; + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + bvec &= any(vec1); + + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + bvec &= all(vec2); + + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + return WaveMatch(bvec); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl new file mode 100644 index 0000000000..02cad5b894 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl @@ -0,0 +1,69 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=35 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=35 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=36 -DNUM=7 %s | FileCheck %s +// RUN: 
%dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=36 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled binary intrinsics that take float-like parameters and +// and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> +// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode number. + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec1 = buf.Load >(0); + vector hVec2 = buf.Load >(512); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec1 
= buf.Load >(2048); + vector fVec2 = buf.Load >(2560); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + vector dVec1 = buf.Load >(4096); + vector dVec2 = buf.Load >(4608); + + // Test simple matching type overloads. + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]]) + vector hRes = FUNC(hVec1, hVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]]) + vector fRes = FUNC(fVec1, fVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]]) + vector dRes = FUNC(dVec1, dVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(2048, fRes); + buf.Store >(4096, dRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl new file mode 100644 index 0000000000..994246b753 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl @@ -0,0 +1,116 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=37 -DUOP=39 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=37 -DUOP=39 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 
-enable-16bit-types -DFUNC=min -DOP=38 -DUOP=40 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=38 -DUOP=40 -DNUM=1022 %s | FileCheck %s + +#ifndef UOP +#define UOP OP +#endif + +// Test vector-enabled binary intrinsics that take signed and unsigned integer parameters of +// different widths and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode numbers. + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(888, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]] + buf.Store(999, UOP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec1 = buf.Load >(0); + vector sVec2 
= buf.Load >(512); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536 + // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec1 = buf.Load >(1024); + vector usVec2 = buf.Load >(1536); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec1 = buf.Load >(2048); + vector iVec2 = buf.Load >(2560); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584 + // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec1 = buf.Load >(3072); + vector uiVec2 = buf.Load >(3584); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec1 = buf.Load >(4096); + vector lVec2 = buf.Load >(4608); + + 
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632 + // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec1 = buf.Load >(5120); + vector ulVec2 = buf.Load >(5632); + + // Test simple matching type overloads. + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]]) + vector sRes = FUNC(sVec1, sVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]]) + vector usRes = FUNC(usVec1, usVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]]) + vector iRes = FUNC(iVec1, iVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]]) + vector uiRes = FUNC(uiVec1, uiVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]]) + vector lRes = FUNC(lVec1, lVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]]) + vector ulRes = FUNC(ulVec1, ulVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, sRes); + buf.Store >(1024, usRes); + 
buf.Store >(2048, iRes); + buf.Store >(3072, uiRes); + buf.Store >(4096, lRes); + buf.Store >(5120, ulRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl new file mode 100644 index 0000000000..40ffd3fe63 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl @@ -0,0 +1,87 @@ +// The binary part of some of these is all just a vector math ops with as many unary dxops as elements. +// These will have apparent mismatches between the ARITY define and the check prefix. + +// RUN: %dxc -DFUNC=abs -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=pow -DARITY=2 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=f16tof32 -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,LEGACY +// RUN: %dxc -DFUNC=f32tof16 -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,LEGACY +// RUN: %dxc -DFUNC=isfinite -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT +// RUN: %dxc -DFUNC=isinf -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT +// RUN: %dxc -DFUNC=isnan -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT +// RUN: %dxc -DFUNC=modf -DARITY=2 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=countbits -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=firstbithigh -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=firstbitlow -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx_coarse -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx_fine -DARITY=1 -T ps_6_9 %s 
| FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy_coarse -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy_fine -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=fwidth -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=QuadReadLaneAt -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=QuadReadAcrossX -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=QuadReadAcrossY -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=QuadReadAcrossDiagonal -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=WaveActiveBitAnd -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveBitOr -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveBitXor -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveProduct -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveSum -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveMin -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveMax -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixBitAnd -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixBitOr -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixBitXor -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc 
-DFUNC=WaveMultiPrefixProduct -DARITY=5 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixSum -DARITY=5 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WavePrefixSum -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WavePrefixProduct -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveReadLaneAt -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveReadLaneFirst -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveAllEqual -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE + +#ifndef TYPE +#define TYPE float +#endif + +#if ARITY == 1 +#define CALLARGS(x,y,z) x +#elif ARITY == 2 +#define CALLARGS(x,y,z) x, y +#elif ARITY == 3 +#define CALLARGS(x,y,z) x, y, z +// ARITY 4 is used for 1 vec + scalar +#elif ARITY == 4 +#define CALLARGS(x,y,z) x, i +// ARITY 5 is used for 1 vec + uint4 mask for wavemultiprefix* +#elif ARITY == 5 +#define CALLARGS(x,y,z) x, m +#endif + +StructuredBuffer< vector > buf; +ByteAddressBuffer rbuf; + +float4 main(uint i : SV_PrimitiveID, uint4 m : M) : SV_Target { + vector arg1 = rbuf.Load< vector >(i++*32); + vector arg2 = rbuf.Load< vector >(i++*32); + vector arg3 = rbuf.Load< vector >(i++*32); + + // UNARY: call {{.*}} [[DXOP:@dx.op.unary]] + // BINARY: call {{.*}} [[DXOP:@dx.op.binary]] + // TERTIARY: call {{.*}} [[DXOP:@dx.op.tertiary]] + // LEGACY: call {{.*}} [[DXOP:@dx.op.legacy]] + // SPECFLT: call {{.*}} [[DXOP:@dx.op.isSpecialFloat]] + // QUAD: call {{.*}} [[DXOP:@dx.op.quad]] + // WAVE: call {{.*}} [[DXOP:@dx.op.wave]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + + vector ret = FUNC(CALLARGS(arg1, arg2, 
arg3)); + return float4(ret[0] + ret[1], ret[2] + ret[3], ret[4] + ret[5], ret[6] + ret[7]); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl new file mode 100644 index 0000000000..e32ebc1db2 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl @@ -0,0 +1,86 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=46 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=46 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled ternary intrinsics that take float-like parameters and +// and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +// Given that all we have at the moment are fmad and fma and the latter only takes doubles, +// fma is tacked on as an additional check. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> +// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode number. 
+ // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec1 = buf.Load >(0); + vector hVec2 = buf.Load >(512); + vector hVec3 = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec1 = buf.Load >(2048); + vector fVec2 = buf.Load >(2560); + vector fVec3 = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] 
@dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + vector dVec1 = buf.Load >(4096); + vector dVec2 = buf.Load >(4608); + vector dVec3 = buf.Load >(5120); + + // Test simple matching type overloads. + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x half> @dx.op.tertiary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]], <[[NUM]] x half> [[hvec3]]) + vector hRes = FUNC(hVec1, hVec2, hVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x float> @dx.op.tertiary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]], <[[NUM]] x float> [[fvec3]]) + vector fRes = FUNC(fVec1, fVec2, fVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x double> @dx.op.tertiary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]], <[[NUM]] x double> [[dvec3]]) + vector dRes = FUNC(dVec1, dVec2, dVec3); + + // Tacked on fma() check since it only takes doubles. 
+ // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x double> @dx.op.tertiary.[[DTY]](i32 47, <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]], <[[NUM]] x double> [[dvec3]]) + vector dRes2 = fma(dVec1, dVec2, dVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(2048, fRes); + buf.Store >(4096, dRes); + buf.Store >(5120, dRes2); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl new file mode 100644 index 0000000000..50f98715e4 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl @@ -0,0 +1,131 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=48 -DUOP=49 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=48 -DUOP=49 -DNUM=1022 %s | FileCheck %s + +#ifndef UOP +#define UOP OP +#endif + +// Test vector-enabled tertiary intrinsics that take signed and unsigned integer parameters of +// different widths and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode numbers. 
+ // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(888, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]] + buf.Store(999, UOP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[svec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec1 = buf.Load >(0); + vector sVec2 = buf.Load >(512); + vector sVec3 = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1025 + // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536 + // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, 
%dx.types.Handle [[buf]], i32 2048 + // CHECK: [[usvec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec1 = buf.Load >(1025); + vector usVec2 = buf.Load >(1536); + vector usVec3 = buf.Load >(2048); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2049 + // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[ivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec1 = buf.Load >(2049); + vector iVec2 = buf.Load >(2560); + vector iVec3 = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3073 + // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584 + // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[uivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec1 = buf.Load >(3073); + vector uiVec2 = buf.Load >(3584); + vector uiVec3 = buf.Load >(4096); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4097 + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] 
@dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[lvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec1 = buf.Load >(4097); + vector lVec2 = buf.Load >(4608); + vector lVec3 = buf.Load >(5120); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5121 + // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632 + // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 6144 + // CHECK: [[ulvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec1 = buf.Load >(5121); + vector ulVec2 = buf.Load >(5632); + vector ulVec3 = buf.Load >(6144); + + // Test simple matching type overloads. 
+ // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.tertiary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]], <[[NUM]] x i16> [[svec3]]) + vector sRes = FUNC(sVec1, sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.tertiary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]], <[[NUM]] x i16> [[usvec3]]) + vector usRes = FUNC(usVec1, usVec2, usVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.tertiary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]], <[[NUM]] x i32> [[ivec3]]) + vector iRes = FUNC(iVec1, iVec2, iVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.tertiary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]], <[[NUM]] x i32> [[uivec3]]) + vector uiRes = FUNC(uiVec1, uiVec2, uiVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.tertiary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]], <[[NUM]] x i64> [[lvec3]]) + vector lRes = FUNC(lVec1, lVec2, lVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.tertiary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]], <[[NUM]] x i64> [[ulvec3]]) + vector ulRes = FUNC(ulVec1, ulVec2, ulVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, sRes); + buf.Store >(1024, usRes); + buf.Store >(2048, iRes); + buf.Store >(3072, uiRes); + buf.Store >(4096, lRes); + buf.Store >(5120, ulRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl new file mode 100644 index 
0000000000..91ab631a7e --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl @@ -0,0 +1,83 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate -DOP=7 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate -DOP=7 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos -DOP=12 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos -DOP=12 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sin -DOP=13 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sin -DOP=13 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tan -DOP=14 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tan -DOP=14 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=acos -DOP=15 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=acos -DOP=15 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=asin -DOP=16 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=asin -DOP=16 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=atan -DOP=17 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=atan -DOP=17 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cosh -DOP=18 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cosh -DOP=18 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sinh -DOP=19 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sinh -DOP=19 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tanh -DOP=20 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tanh -DOP=20 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 
-enable-16bit-types -DFUNC=exp2 -DOP=21 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=exp2 -DOP=21 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=frac -DOP=22 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=frac -DOP=22 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log2 -DOP=23 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log2 -DOP=23 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log10 -DOP=23 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log10 -DOP=23 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sqrt -DOP=24 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sqrt -DOP=24 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=rsqrt -DOP=25 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=rsqrt -DOP=25 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=round -DOP=26 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=round -DOP=26 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=floor -DOP=27 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=floor -DOP=27 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ceil -DOP=28 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ceil -DOP=28 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled unary intrinsics that take float-like parameters and +// and are "trivial" in that they can be implemented with a single call +// instruction with the same 
parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode number. + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec = buf.Load >(0); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[fvec:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec = buf.Load >(1024); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec]]) + vector hRes = FUNC(hVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec]]) + vector fRes = FUNC(fVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(1024, fRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl new file mode 100644 index 0000000000..ef0b250745 --- /dev/null +++ 
b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl @@ -0,0 +1,86 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=reversebits -DOP=30 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=reversebits -DOP=30 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled unary intrinsics that take signed and unsigned integer parameters of +// different widths and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +[numthreads(8,1,1)] +void main() { + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // Capture opcode number. 
+ // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec = buf.Load >(0); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[usvec:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[ivec:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec = buf.Load >(2048); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[uivec:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[lvec:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec = buf.Load >(4096); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[ulvec:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec = buf.Load >(5120); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.unary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec]]) + vector sRes = FUNC(sVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: 
insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.unary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[usvec]]) + vector usRes = FUNC(usVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.unary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec]]) + vector iRes = FUNC(iVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.unary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[uivec]]) + vector uiRes = FUNC(uiVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.unary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec]]) + vector lRes = FUNC(lVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.unary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[ulvec]]) + vector ulRes = FUNC(ulVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, sRes); + buf.Store >(1024, usRes); + buf.Store >(2048, iRes); + buf.Store >(3072, uiRes); + buf.Store >(4096, lRes); + buf.Store >(5120, ulRes); +} diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl new file mode 100644 index 0000000000..11d705305d --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl @@ -0,0 +1,186 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=13 %s | FileCheck %s + +// Source for dxilgen test CodeGenDXIL/passes/longvec-intrinsics.ll. +// Some targetted filecheck testing as an incidental. 
+ +RWStructuredBuffer > hBuf; +RWStructuredBuffer > fBuf; +RWStructuredBuffer > dBuf; + +RWStructuredBuffer > bBuf; +RWStructuredBuffer > uBuf; +RWStructuredBuffer > lBuf; + +[numthreads(8,1,1)] +void main() { + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 11, i32 0, i32 4) + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 12, i32 0, i32 4) + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 13, i32 0, i32 4) + // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0 + vector fVec1 = fBuf[11]; + vector fVec2 = fBuf[12]; + vector fVec3 = fBuf[13]; + + // CHECK: [[tmp:%.*]] = call <13 x float> @dx.op.binary.v13f32(i32 35, <13 x float> [[fvec1]], <13 x float> [[fvec2]]) ; FMax(a,b) + // CHECK: call <13 x float> @dx.op.binary.v13f32(i32 36, <13 x float> [[tmp]], <13 x float> [[fvec3]]) ; FMin(a,b) + vector fRes = clamp(fVec1, fVec2, fVec3); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 14, i32 0, i32 2) + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 15, i32 0, i32 2) + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 16, i32 0, i32 2) + // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0 + vector hVec1 = hBuf[14]; + vector hVec2 = hBuf[15]; + vector hVec3 = hBuf[16]; + + // 
CHECK: [[tmp:%.*]] = fcmp fast olt <13 x half> [[hvec2]], [[hvec1]] + // CHECK: select <13 x i1> [[tmp]], <13 x half> zeroinitializer, <13 x half> hRes = step(hVec1, hVec2); + + // CHECK: [[tmp:%.*]] = fmul fast <13 x float> [[fvec1]], @dx.op.unary.v13f32(i32 21, <13 x float> [[tmp]]) ; Exp(value) + fRes += exp(fVec1); + + // CHECK: [[tmp:%.*]] = call <13 x half> @dx.op.unary.v13f16(i32 23, <13 x half> [[hvec1]]) ; Log(value) + // CHECK: fmul fast <13 x half> [[tmp]], [[fvec2]], [[fvec1]] + // CHECK: [[xsub:%.*]] = fsub fast <13 x float> [[fvec3]], [[fvec1]] + // CHECK: [[div:%.*]] = fdiv fast <13 x float> [[xsub]], [[sub]] + // CHECK: [[sat:%.*]] = call <13 x float> @dx.op.unary.v13f32(i32 7, <13 x float> [[div]]) ; Saturate(value) + // CHECK: [[mul:%.*]] = fmul fast <13 x float> [[sat]], , [[mul]] + // CHECK: [[mul:%.*]] = fmul fast <13 x float> [[sat]], [[sat]] + // CHECK: fmul fast <13 x float> [[mul]], [[sub]] + fRes += smoothstep(fVec1, fVec2, fVec3); + + // Intrinsics that expand into llvm ops. 
+ + // CHECK: fmul fast <13 x float> [[fvec3]], [[fvec1]], zeroinitializer + // CHECK: [[f2i:%.*]] = bitcast <13 x float> [[fvec1]] to <13 x i32> + // CHECK: [[and:%.*]] = and <13 x i32> [[f2i]], [[and]], [[add]], [[shr]] to <13 x float> + // CHECK: [[sel:%.*]] = select <13 x i1> [[cmp]], <13 x float> [[i2f]], <13 x float> zeroinitializer + // CHECK: [[and:%.*]] = and <13 x i32> [[f2i]], [[and]], exp = fVec3; + fRes += frexp(fVec1, exp); + fRes += exp; + + // CHECK: [[tmp:%.*]] = fsub fast <13 x half> [[hvec3]], [[hvec2]] + // CHECK: fmul fast <13 x half> [[tmp]], [[hvec1]] + hRes += lerp(hVec2, hVec3, hVec1); + + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4) + // CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 18, i32 0, i32 4) + // CHECK: [[uvec2:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + vector uVec1 = uBuf[17]; + vector uVec2 = uBuf[18]; + + vector signs = 1; + // CHECK: [[cmp:%.*]] = icmp ne <13 x i32> [[uvec2]], zeroinitializer + // CHECK: zext <13 x i1> [[cmp]] to <13 x i32> + signs *= sign(uVec2); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i64 @dx.op.rawBufferVectorLoad.v13i64(i32 303, %dx.types.Handle {{%.*}}, i32 19, i32 0, i32 8) + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.v13i64 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i64 @dx.op.rawBufferVectorLoad.v13i64(i32 303, %dx.types.Handle {{%.*}}, i32 20, i32 0, i32 8) + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.v13i64 [[ld]], 0 + vector lVec1 = lBuf[19]; + vector lVec2 = lBuf[20]; + + // CHECK: [[gt:%.*]] = icmp sgt <13 x i64> [[lvec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <13 x i64> [[lvec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <13 x i1> [[gt]] to <13 x i32> + // CHECK: 
[[ilt:%.*]] = zext <13 x i1> [[lt]] to <13 x i32> + // CHECK: sub nsw <13 x i32> [[igt]], [[ilt]] + signs *= sign(lVec2); + + vector uRes = signs; + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 21, i32 0, i32 4) + // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer + // CHECK: [[vec1:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32> + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 22, i32 0, i32 4) + // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer + // CHECK: [[vec2:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32> + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 23, i32 0, i32 4) + // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer + // CHECK: [[vec3:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32> + vector bVec1 = bBuf[21]; + vector bVec2 = bBuf[22]; + vector bVec3 = bBuf[23]; + + // CHECK: [[bvec2:%.*]] = icmp ne <13 x i32> [[vec2]], zeroinitializer + // CHECK: [[bvec1:%.*]] = icmp ne <13 x i32> [[vec1]], zeroinitializer + // CHECK: or <13 x i1> [[bvec2]], [[bvec1]] + uRes += or(bVec1, bVec2); + + // CHECK: [[bvec3:%.*]] = icmp ne <13 x i32> [[vec3]], zeroinitializer + // CHECK: and <13 x i1> [[bvec3]], [[bvec2]] + uRes += and(bVec2, bVec3); + + // CHECK: select <13 x i1> [[bvec3]], <13 x i64> [[lvec1]], <13 x i64> [[lvec2]] + vector lRes = select(bVec3, lVec1, lVec2); + + // CHECK: [[el1:%.*]] = extractelement <13 x float> [[fvec1]] + // CHECK: [[el2:%.*]] = extractelement <13 x float> [[fvec2]] + // CHECK: [[mul:%.*]] = fmul fast float [[el2]], [[el1]] + // CHECK: 
[[mad1:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mul]]) ; FMad(a,b,c) + // CHECK: [[mad2:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad1]]) ; FMad(a,b,c) + // CHECK: [[mad3:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad2]]) ; FMad(a,b,c) + // CHECK: [[mad4:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad3]]) ; FMad(a,b,c) + // CHECK: [[mad5:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad4]]) ; FMad(a,b,c) + // CHECK: [[mad6:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad5]]) ; FMad(a,b,c) + // CHECK: [[mad7:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad6]]) ; FMad(a,b,c) + // CHECK: [[mad8:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad7]]) ; FMad(a,b,c) + // CHECK: [[mad9:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad8]]) ; FMad(a,b,c) + // CHECK: [[mad10:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad9]]) ; FMad(a,b,c) + // CHECK: [[mad11:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad10]]) ; FMad(a,b,c) + // CHECK: [[mad12:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad11]]) ; FMad(a,b,c) + fRes += dot(fVec1, fVec2); + + // CHECK: call <13 x float> @dx.op.unary.v13f32(i32 17, <13 x float> [[fvec1]]) ; Atan(value) + fRes += atan(fVec1); + + // CHECK: call <13 x i32> @dx.op.binary.v13i32(i32 40, <13 x i32> [[uvec1]], <13 x i32> [[uvec2]]) ; UMin(a,b) + uRes += min(uVec1, uVec2); + + // CHECK: call <13 x float> @dx.op.tertiary.v13f32(i32 46, <13 x float> [[fvec1]], <13 x float> [[fvec2]], <13 x float> [[fvec3]]) ; FMad(a,b,c) + fRes += mad(fVec1, fVec2, fVec3); + + // CHECK: 
[[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 26, i32 0, i32 8) + // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0 + vector dVec1 = dBuf[24]; + vector dVec2 = dBuf[25]; + vector dVec3 = dBuf[26]; + + // CHECK: call <13 x double> @dx.op.tertiary.v13f64(i32 47, <13 x double> [[dvec1]], <13 x double> [[dvec2]], <13 x double> [[dvec3]]) + vector dRes = fma(dVec1, dVec2, dVec3); + + hBuf[0] = hRes; + fBuf[0] = fRes; + dBuf[0] = dRes; + uBuf[0] = uRes; + lBuf[0] = lRes; +} diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll new file mode 100644 index 0000000000..8f9dcbbdbc --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll @@ -0,0 +1,434 @@ +; RUN: %dxopt %s -dxilgen -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%"class.RWStructuredBuffer >" = type { <7 x half> } +%"class.RWStructuredBuffer >" = type { <7 x float> } +%"class.RWStructuredBuffer >" = type { <7 x double> } +%"class.RWStructuredBuffer >" = type { <7 x i32> } +%"class.RWStructuredBuffer >" = type { <7 x i32> } +%"class.RWStructuredBuffer >" = type { <7 x i64> } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" = external global %"class.RWStructuredBuffer >", align 2 
+@"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" = external global %"class.RWStructuredBuffer >", align 4 +@"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" = external global %"class.RWStructuredBuffer >", align 8 +@"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" = external global %"class.RWStructuredBuffer >", align 4 +@"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" = external global %"class.RWStructuredBuffer >", align 4 +@"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" = external global %"class.RWStructuredBuffer >", align 8 + +; CHECK-LABEL: define void @main() +define void @main() #0 { +bb: + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 11, i32 0, i32 4) + ; CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 12, i32 0, i32 4) + ; CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 13, i32 0, i32 4) + ; CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0 + + %exp = alloca <7 x float>, align 4 + %tmp = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:23 col:30 + %tmp1 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp) ; line:23 col:30 + %tmp2 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp1, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:23 col:30 + %tmp3 = call <7 x float>* 
@"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp2, i32 11) ; line:23 col:30 + %tmp4 = load <7 x float>, <7 x float>* %tmp3 ; line:23 col:30 + %tmp5 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:24 col:30 + %tmp6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp5) ; line:24 col:30 + %tmp7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:24 col:30 + %tmp8 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp7, i32 12) ; line:24 col:30 + %tmp9 = load <7 x float>, <7 x float>* %tmp8 ; line:24 col:30 + %tmp10 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:25 col:30 + %tmp11 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp10) ; line:25 col:30 + %tmp12 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp11, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:25 col:30 + %tmp13 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp12, i32 13) ; line:25 col:30 + %tmp14 = load <7 x float>, <7 x float>* %tmp13 ; line:25 col:30 + + ; Clamp operation. 
+ ; CHECK: [[max:%.*]] = call <7 x float> @dx.op.binary.v7f32(i32 35, <7 x float> [[fvec1]], <7 x float> [[fvec2]]) + ; CHECK: call <7 x float> @dx.op.binary.v7f32(i32 36, <7 x float> [[max]], <7 x float> [[fvec3]]) + %tmp15 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 119, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:29 col:29 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 14, i32 0, i32 2) + ; CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 15, i32 0, i32 2) + ; CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 16, i32 0, i32 2) + ; CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0 + %tmp16 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:37 col:34 + %tmp17 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp16) ; line:37 col:34 + %tmp18 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp17, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:37 col:34 + %tmp19 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp18, i32 14) ; line:37 col:34 + %tmp20 = load <7 x half>, <7 x half>* %tmp19 ; line:37 col:34 + %tmp21 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* 
@"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:38 col:34 + %tmp22 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp21) ; line:38 col:34 + %tmp23 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp22, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:38 col:34 + %tmp24 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp23, i32 15) ; line:38 col:34 + %tmp25 = load <7 x half>, <7 x half>* %tmp24 ; line:38 col:34 + %tmp26 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:39 col:34 + %tmp27 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp26) ; line:39 col:34 + %tmp28 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp27, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:39 col:34 + %tmp29 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp28, i32 16) ; line:39 col:34 + %tmp30 = load <7 x half>, <7 x half>* %tmp29 ; line:39 col:34 + + ; Step operation. + ; CHECK: [[cmp:%.*]] = fcmp fast olt <7 x half> [[hvec2]], [[hvec1]] + ; CHECK: select <7 x i1> [[cmp]], <7 x half> zeroinitializer, <7 x half> + %tmp31 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>)"(i32 192, <7 x half> %tmp20, <7 x half> %tmp25) ; line:43 col:33 + + ; Exp operation. 
+ ; CHECK: [[mul:%.*]] = fmul fast <7 x float> , [[fvec1]] + ; CHECK call <7 x float> @dx.op.unary.v7f32(i32 21, <7 x float> [[mul]]) + %tmp32 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 139, <7 x float> %tmp4) ; line:47 col:11 + %tmp33 = fadd <7 x float> %tmp15, %tmp32 ; line:47 col:8 + + ; Log operation. + ; CHECK: [[log:%.*]] = call <7 x half> @dx.op.unary.v7f16(i32 23, <7 x half> [[hvec1]]) + ; CHECK: fmul fast <7 x half> , [[log]] + %tmp34 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>)"(i32 159, <7 x half> %tmp20) ; line:51 col:11 + %tmp35 = fadd <7 x half> %tmp31, %tmp34 ; line:51 col:8 + + ; Smoothstep operation. + ; CHECK: [[sub1:%.*]] = fsub fast <7 x float> [[fvec2]], [[fvec1]] + ; CHECK: [[sub2:%.*]] = fsub fast <7 x float> [[fvec3]], [[fvec1]] + ; CHECK: [[div:%.*]] = fdiv fast <7 x float> [[sub2]], [[sub1]] + ; CHECK: [[sat:%.*]] = call <7 x float> @dx.op.unary.v7f32(i32 7, <7 x float> [[div]]) + ; CHECK: [[mul:%.*]] = fmul fast <7 x float> [[sat]], + ; CHECK: [[sub:%.*]] = fsub fast <7 x float> , [[mul]] + ; CHECK: [[mul:%.*]] = fmul fast <7 x float> [[sat]], [[sub]] + ; CHECK: fmul fast <7 x float> %Saturate, [[mul]] + %tmp36 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 189, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:61 col:11 + %tmp37 = fadd <7 x float> %tmp33, %tmp36 ; line:61 col:8 + + ; Radians operation. + ; CHECK: fmul fast <7 x float> , [[fvec3]] + %tmp38 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 176, <7 x float> %tmp14) ; line:66 col:11 + %tmp39 = fadd <7 x float> %tmp37, %tmp38 ; line:66 col:8 + store <7 x float> %tmp14, <7 x float>* %exp, align 4 ; line:77 col:22 + + ; Frexp operation. 
+ ; CHECK: [[cmp:%.*]] = fcmp fast une <7 x float> [[fvec1]], zeroinitializer + ; CHECK: [[ext:%.*]] = sext <7 x i1> [[cmp]] to <7 x i32> + ; CHECK: [[bct:%.*]] = bitcast <7 x float> [[fvec1]] to <7 x i32> + ; CHECK: [[and:%.*]] = and <7 x i32> [[bct]], + ; CHECK: [[add:%.*]] = add <7 x i32> [[and]], + ; CHECK: [[and:%.*]] = and <7 x i32> [[add]], [[ext]] + ; CHECK: [[shr:%.*]] = ashr <7 x i32> [[and]], + ; CHECK: [[i2f:%.*]] = sitofp <7 x i32> [[shr]] to <7 x float> + ; CHECK: store <7 x float> [[i2f]], <7 x float>* %exp + ; CHECK: [[and:%.*]] = and <7 x i32> [[bct]], + ; CHECK: [[or:%.*]] = or <7 x i32> [[and]], + ; CHECK: [[and:%.*]] = and <7 x i32> [[or]], [[ext]] + ; CHECK: bitcast <7 x i32> [[and]] to <7 x float> + %tmp41 = call <7 x float> @"dx.hl.op..<7 x float> (i32, <7 x float>, <7 x float>*)"(i32 150, <7 x float> %tmp4, <7 x float>* %exp) ; line:78 col:11 + %tmp42 = fadd <7 x float> %tmp39, %tmp41 ; line:78 col:8 + %tmp43 = load <7 x float>, <7 x float>* %exp, align 4 ; line:79 col:11 + %tmp44 = fadd <7 x float> %tmp42, %tmp43 ; line:79 col:8 + + ; Lerp operation. 
+ ; CHECK: [[sub:%.*]] = fsub fast <7 x half> [[hvec3]], [[hvec2]] + ; CHECK: fmul fast <7 x half> [[hvec1]], [[sub]] + %tmp45 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>, <7 x half>)"(i32 157, <7 x half> %tmp25, <7 x half> %tmp30, <7 x half> %tmp20) ; line:83 col:11 + %tmp46 = fadd <7 x half> %tmp35, %tmp45 ; line:83 col:8 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4) + ; CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 18, i32 0, i32 4) + ; CHECK: [[uvec2:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + %tmp47 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:90 col:29 + %tmp48 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp47) ; line:90 col:29 + %tmp49 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp48, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:90 col:29 + %tmp50 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp49, i32 17) ; line:90 col:29 + %tmp51 = load <7 x i32>, <7 x i32>* %tmp50 ; line:90 col:29 + %tmp52 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:91 col:29 + %tmp53 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp52) ; line:91 col:29 + %tmp54 = call %dx.types.Handle 
@"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp53, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:91 col:29 + %tmp55 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp54, i32 18) ; line:91 col:29 + %tmp56 = load <7 x i32>, <7 x i32>* %tmp55 ; line:91 col:29 + + ; Unsigned int sign operation. + ; CHECK: [[cmp:%.*]] = icmp ne <7 x i32> [[uvec2]], zeroinitializer + ; CHECK: zext <7 x i1> [[cmp]] to <7 x i32> + %tmp57 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>)"(i32 355, <7 x i32> %tmp56) ; line:96 col:12 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i64 @dx.op.rawBufferVectorLoad.v7i64(i32 303, %dx.types.Handle {{%.*}}, i32 19, i32 0, i32 8) + ; CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.v7i64 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i64 @dx.op.rawBufferVectorLoad.v7i64(i32 303, %dx.types.Handle {{%.*}}, i32 20, i32 0, i32 8) + ; CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.v7i64 [[ld]], 0 + %tmp58 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:102 col:32 + %tmp59 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp58) ; line:102 col:32 + %tmp60 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp59, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:102 col:32 + %tmp61 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp60, i32 19) ; line:102 col:32 + %tmp62 = load <7 x 
i64>, <7 x i64>* %tmp61 ; line:102 col:32 + %tmp63 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:103 col:32 + %tmp64 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp63) ; line:103 col:32 + %tmp65 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp64, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:103 col:32 + %tmp66 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp65, i32 20) ; line:103 col:32 + %tmp67 = load <7 x i64>, <7 x i64>* %tmp66 ; line:103 col:32 + + ; Signed int sign operation. + ; CHECK: [[lt1:%.*]] = icmp slt <7 x i64> zeroinitializer, [[lvec2]] + ; CHECK: [[lt2:%.*]] = icmp slt <7 x i64> [[lvec2]], zeroinitializer + ; CHECK: [[ilt1:%.*]] = zext <7 x i1> [[lt1]] to <7 x i32> + ; CHECK: [[ilt2:%.*]] = zext <7 x i1> [[lt2]] to <7 x i32> + ; CHECK: sub <7 x i32> [[ilt1]], [[ilt2]] + %tmp68 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i64>)"(i32 185, <7 x i64> %tmp67) ; line:110 col:12 + %tmp69 = mul <7 x i32> %tmp57, %tmp68 ; line:110 col:9 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 21, i32 0, i32 4) + ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer + ; CHECK: [[vec1:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32> + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 22, i32 0, i32 4) + ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[bvec:%.*]] = icmp 
ne <7 x i32> [[vec]], zeroinitializer + ; CHECK: [[vec2:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32> + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 23, i32 0, i32 4) + ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer + ; CHECK: [[vec3:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32> + %tmp70 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:126 col:29 + %tmp71 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp70) ; line:126 col:29 + %tmp72 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp71, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:126 col:29 + %tmp73 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp72, i32 21) ; line:126 col:29 + %tmp74 = load <7 x i32>, <7 x i32>* %tmp73 ; line:126 col:29 + %tmp75 = icmp ne <7 x i32> %tmp74, zeroinitializer ; line:126 col:29 + %tmp76 = zext <7 x i1> %tmp75 to <7 x i32> ; line:126 col:21 + %tmp77 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:127 col:29 + %tmp78 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp77) ; line:127 col:29 + %tmp79 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp78, %dx.types.ResourceProperties { i32 
4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:127 col:29 + %tmp80 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp79, i32 22) ; line:127 col:29 + %tmp81 = load <7 x i32>, <7 x i32>* %tmp80 ; line:127 col:29 + %tmp82 = icmp ne <7 x i32> %tmp81, zeroinitializer ; line:127 col:29 + %tmp83 = zext <7 x i1> %tmp82 to <7 x i32> ; line:127 col:21 + %tmp84 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:128 col:29 + %tmp85 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp84) ; line:128 col:29 + %tmp86 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp85, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:128 col:29 + %tmp87 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp86, i32 23) ; line:128 col:29 + %tmp88 = load <7 x i32>, <7 x i32>* %tmp87 ; line:128 col:29 + %tmp89 = icmp ne <7 x i32> %tmp88, zeroinitializer ; line:128 col:29 + %tmp90 = zext <7 x i1> %tmp89 to <7 x i32> ; line:128 col:21 + + + ; Or() operation. 
+ ; CHECK: [[bvec2:%.*]] = icmp ne <7 x i32> [[vec2]], zeroinitializer + ; CHECK: [[bvec1:%.*]] = icmp ne <7 x i32> [[vec1]], zeroinitializer + ; CHECK: or <7 x i1> [[bvec1]], [[bvec2]] + %tmp91 = icmp ne <7 x i32> %tmp83, zeroinitializer ; line:133 col:21 + %tmp92 = icmp ne <7 x i32> %tmp76, zeroinitializer ; line:133 col:14 + %tmp93 = call <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32 169, <7 x i1> %tmp92, <7 x i1> %tmp91) ; line:133 col:11 + %tmp94 = zext <7 x i1> %tmp93 to <7 x i32> ; line:133 col:11 + %tmp95 = add <7 x i32> %tmp69, %tmp94 ; line:133 col:8 + + ; And() operation. + ; CHECK: [[bvec3:%.*]] = icmp ne <7 x i32> [[vec3]], zeroinitializer + ; CHECK: [[bvec2:%.*]] = icmp ne <7 x i32> [[vec2]], zeroinitializer + ; CHECK: and <7 x i1> [[bvec2]], [[bvec3]] + %tmp96 = icmp ne <7 x i32> %tmp90, zeroinitializer ; line:137 col:22 + %tmp97 = icmp ne <7 x i32> %tmp83, zeroinitializer ; line:137 col:15 + %tmp98 = call <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32 106, <7 x i1> %tmp97, <7 x i1> %tmp96) ; line:137 col:11 + %tmp99 = zext <7 x i1> %tmp98 to <7 x i32> ; line:137 col:11 + %tmp100 = add <7 x i32> %tmp95, %tmp99 ; line:137 col:8 + + ; Select() operation. + ; CHECK: [[bvec3:%.*]] = icmp ne <7 x i32> [[vec3]], zeroinitializer + ; CHECK: select <7 x i1> [[bvec3]], <7 x i64> [[lvec1]], <7 x i64> [[lvec2]] + %tmp101 = icmp ne <7 x i32> %tmp90, zeroinitializer ; line:140 col:38 + %tmp102 = call <7 x i64> @"dx.hl.op.rn.<7 x i64> (i32, <7 x i1>, <7 x i64>, <7 x i64>)"(i32 184, <7 x i1> %tmp101, <7 x i64> %tmp62, <7 x i64> %tmp67) ; line:140 col:31 + %tmp103 = call float @"dx.hl.op.rn.float (i32, <7 x float>, <7 x float>)"(i32 134, <7 x float> %tmp4, <7 x float> %tmp9) ; line:152 col:11 + + ; Dot operation. 
+ ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 0 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 0 + ; CHECK: [[mul:%.*]] = fmul fast float [[el1]], [[el2]] + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 1 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 1 + ; CHECK: [[mad1:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mul]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 2 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 2 + ; CHECK: [[mad2:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad1]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 3 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 3 + ; CHECK: [[mad3:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad2]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 4 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 4 + ; CHECK: [[mad4:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad3]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 5 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 5 + ; CHECK: [[mad5:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad4]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 6 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 6 + ; CHECK: call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad5]]) + %tmp104 = insertelement <7 x float> undef, float %tmp103, i32 0 ; line:152 col:11 + %tmp105 = shufflevector <7 x float> %tmp104, <7 x float> undef, <7 x i32> zeroinitializer ; line:152 col:11 + %tmp106 = fadd <7 x float> %tmp44, %tmp105 ; line:152 col:8 + + ; Atan operation. 
+ ; CHECK: call <7 x float> @dx.op.unary.v7f32(i32 17, <7 x float> [[fvec1]]) + %tmp107 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 116, <7 x float> %tmp4) ; line:155 col:11 + %tmp108 = fadd <7 x float> %tmp106, %tmp107 ; line:155 col:8 + + ; Min operation. + ; CHECK: call <7 x i32> @dx.op.binary.v7i32(i32 40, <7 x i32> [[uvec1]], <7 x i32> [[uvec2]]) + %tmp109 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>, <7 x i32>)"(i32 353, <7 x i32> %tmp51, <7 x i32> %tmp56) ; line:158 col:11 + %tmp110 = add <7 x i32> %tmp100, %tmp109 ; line:158 col:8 + + ; Mad operation. + ; CHECK: call <7 x float> @dx.op.tertiary.v7f32(i32 46, <7 x float> [[fvec1]], <7 x float> [[fvec2]], <7 x float> [[fvec3]]) + %tmp111 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 162, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:161 col:11 + %tmp112 = fadd <7 x float> %tmp108, %tmp111 ; line:161 col:8 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) + ; CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) + ; CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 26, i32 0, i32 8) + ; CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0 + %tmp113 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:169 col:31 + %tmp114 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp113) ; line:169 col:31 + %tmp115 = call %dx.types.Handle 
@"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp114, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:169 col:31 + %tmp116 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp115, i32 24) ; line:169 col:31 + %tmp117 = load <7 x double>, <7 x double>* %tmp116 ; line:169 col:31 + %tmp118 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:170 col:31 + %tmp119 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp118) ; line:170 col:31 + %tmp120 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp119, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:170 col:31 + %tmp121 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp120, i32 25) ; line:170 col:31 + %tmp122 = load <7 x double>, <7 x double>* %tmp121 ; line:170 col:31 + %tmp123 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:171 col:31 + %tmp124 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp123) ; line:171 col:31 + %tmp125 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp124, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" 
zeroinitializer) ; line:171 col:31 + %tmp126 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp125, i32 26) ; line:171 col:31 + %tmp127 = load <7 x double>, <7 x double>* %tmp126 ; line:171 col:31 + + ; FMA operation. + ; CHECK: call <7 x double> @dx.op.tertiary.v7f64(i32 47, <7 x double> [[dvec1]], <7 x double> [[dvec2]], <7 x double> [[dvec3]]) + %tmp128 = call <7 x double> @"dx.hl.op.rn.<7 x double> (i32, <7 x double>, <7 x double>, <7 x double>)"(i32 147, <7 x double> %tmp117, <7 x double> %tmp122, <7 x double> %tmp127) ; line:174 col:30 + %tmp129 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:176 col:3 + %tmp130 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp129) ; line:176 col:3 + %tmp131 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp130, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:176 col:3 + %tmp132 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp131, i32 0) ; line:176 col:3 + store <7 x half> %tmp46, <7 x half>* %tmp132 ; line:176 col:11 + %tmp133 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:177 col:3 + %tmp134 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp133) ; line:177 col:3 + %tmp135 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle 
%tmp134, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:177 col:3 + %tmp136 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp135, i32 0) ; line:177 col:3 + store <7 x float> %tmp112, <7 x float>* %tmp136 ; line:177 col:11 + %tmp137 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:178 col:3 + %tmp138 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp137) ; line:178 col:3 + %tmp139 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp138, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:178 col:3 + %tmp140 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp139, i32 0) ; line:178 col:3 + store <7 x double> %tmp128, <7 x double>* %tmp140 ; line:178 col:11 + %tmp141 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:179 col:3 + %tmp142 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp141) ; line:179 col:3 + %tmp143 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp142, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:179 col:3 + %tmp144 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp143, i32 0) ; 
line:179 col:3 + store <7 x i32> %tmp110, <7 x i32>* %tmp144 ; line:179 col:11 + %tmp145 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:180 col:3 + %tmp146 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp145) ; line:180 col:3 + %tmp147 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp146, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:180 col:3 + %tmp148 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp147, i32 0) ; line:180 col:3 + store <7 x i64> %tmp102, <7 x i64>* %tmp148 ; line:180 col:11 + ret void ; line:181 col:1 +} + +declare <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32, <7 x float>, <7 x float>, <7 x float>) #1 +declare <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, 
%dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>)"(i32, <7 x half>, <7 x half>) #1 +declare <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32, <7 x float>) #1 +declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>)"(i32, <7 x half>) #1 +declare <7 x float> @"dx.hl.op..<7 x float> (i32, <7 x float>, <7 x float>*)"(i32, <7 x float>, <7 x float>*) #0 +declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>, <7 x half>)"(i32, <7 x half>, <7 x half>, <7 x half>) #1 +declare <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>)"(i32, <7 x i32>) #1 +declare <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i64>)"(i32, <7 x i64>) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32, <7 x i1>, <7 x i1>) #1 +declare <7 x i64> @"dx.hl.op.rn.<7 x i64> (i32, <7 x i1>, <7 x i64>, <7 x i64>)"(i32, <7 x i1>, <7 x i64>, <7 x i64>) #1 +declare float @"dx.hl.op.rn.float (i32, <7 x float>, <7 x float>)"(i32, <7 x float>, <7 x float>) #1 +declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>, <7 x i32>)"(i32, <7 x i32>, <7 x i32>) #1 +declare <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x double> @"dx.hl.op.rn.<7 x double> (i32, <7 x double>, <7 x double>, <7 x double>)"(i32, <7 x double>, <7 x double>, <7 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!pauseresume = !{!1} +!dx.version = !{!3} +!dx.valver = !{!3} +!dx.shaderModel = !{!4} +!dx.typeAnnotations = !{!5, !36} +!dx.entryPoints = !{!40} +!dx.fnprops = !{!52} +!dx.options = !{!53, !54} + +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!3 = !{i32 1, i32 9} +!4 = !{!"cs", i32 6, i32 9} +!5 = !{i32 0, %"class.RWStructuredBuffer >" undef, !6, %"class.RWStructuredBuffer >" undef, !11, %"class.RWStructuredBuffer >" undef, !16, %"class.RWStructuredBuffer >" undef, !21, %"class.RWStructuredBuffer >" undef, !26, %"class.RWStructuredBuffer >" undef, !31} +!6 = !{i32 14, !7, !8} +!7 = !{i32 6, 
!"h", i32 3, i32 0, i32 7, i32 8, i32 13, i32 7} +!8 = !{i32 0, !9} +!9 = !{!10} +!10 = !{i32 0, <7 x half> undef} +!11 = !{i32 28, !12, !13} +!12 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9, i32 13, i32 7} +!13 = !{i32 0, !14} +!14 = !{!15} +!15 = !{i32 0, <7 x float> undef} +!16 = !{i32 56, !17, !18} +!17 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 10, i32 13, i32 7} +!18 = !{i32 0, !19} +!19 = !{!20} +!20 = !{i32 0, <7 x double> undef} +!21 = !{i32 28, !22, !23} +!22 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 1, i32 13, i32 7} +!23 = !{i32 0, !24} +!24 = !{!25} +!25 = !{i32 0, <7 x i1> undef} +!26 = !{i32 28, !27, !28} +!27 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5, i32 13, i32 7} +!28 = !{i32 0, !29} +!29 = !{!30} +!30 = !{i32 0, <7 x i32> undef} +!31 = !{i32 56, !32, !33} +!32 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 6, i32 13, i32 7} +!33 = !{i32 0, !34} +!34 = !{!35} +!35 = !{i32 0, <7 x i64> undef} +!36 = !{i32 1, void ()* @main, !37} +!37 = !{!38} +!38 = !{i32 1, !39, !39} +!39 = !{} +!40 = !{void ()* @main, !"main", null, !41, null} +!41 = !{null, !42, null, null} +!42 = !{!43, !45, !47, !49, !50, !51} +!43 = !{i32 0, %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A", !"hBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !44} +!44 = !{i32 1, i32 14} +!45 = !{i32 1, %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A", !"fBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46} +!46 = !{i32 1, i32 28} +!47 = !{i32 2, %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A", !"dBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !48} +!48 = !{i32 1, i32 56} +!49 = !{i32 3, %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A", !"bBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46} +!50 = !{i32 4, %"class.RWStructuredBuffer >"* 
@"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A", !"uBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46} +!51 = !{i32 5, %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A", !"lBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !48} +!52 = !{void ()* @main, i32 5, i32 8, i32 1, i32 1} +!53 = !{i32 0} +!54 = !{i32 -1} +!59 = !{!60, !60, i64 0} +!60 = !{!"omnipotent char", !61, i64 0} +!61 = !{!"Simple C/C++ TBAA"} diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py index 691c3ba58f..548aae4192 100644 --- a/utils/hct/hctdb.py +++ b/utils/hct/hctdb.py @@ -1503,7 +1503,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "returns the " + i, - "hfd", + "hfd<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1537,7 +1537,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "returns the " + i, - "hf", + "hf<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1554,7 +1554,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "returns the reverse bit pattern of the input value", - "wil", + "wil<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1601,7 +1601,7 @@ def UFI(name, **mappings): next_op_idx, "Binary", "returns the " + i + " of the input values", - "hfd", + "hfd<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1619,7 +1619,7 @@ def UFI(name, **mappings): next_op_idx, "Binary", "returns the " + i + " of the input values", - "wil", + "wil<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1674,7 +1674,7 @@ def UFI(name, **mappings): next_op_idx, "Tertiary", "performs a fused multiply add (FMA) of the form a * b + c", - "hfd", + "hfd<", "rn", [ db_dxil_param( @@ -1691,7 +1691,7 @@ def UFI(name, **mappings): next_op_idx, "Tertiary", "performs a fused multiply add (FMA) of the form a * b + c", - "d", + "d<", "rn", [ db_dxil_param( @@ -1715,7 +1715,7 @@ def UFI(name, **mappings): next_op_idx, "Tertiary", "performs an integral " + 
i, - "wil", + "wil<", "rn", [ db_dxil_param(0, "$o", "", "the operation result"), @@ -2608,7 +2608,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per stamp", - "hf", + "hf<", "rn", [ db_dxil_param( @@ -2626,7 +2626,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per stamp", - "hf", + "hf<", "rn", [ db_dxil_param( @@ -2644,7 +2644,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per pixel", - "hf", + "hf<", "rn", [ db_dxil_param( @@ -2662,7 +2662,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per pixel", - "hf", + "hf<", "rn", [ db_dxil_param(