diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp
index 0b4c7218d4..7047d9fe59 100644
--- a/lib/DXIL/DxilOperations.cpp
+++ b/lib/DXIL/DxilOperations.cpp
@@ -96,16 +96,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x7}},
-     {{0x0}}}, // Overloads: hfd
+     {{0x407}},
+     {{0x7}}}, // Overloads: hfd<hfd
     {OC::Saturate,
      "Saturate",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x7}},
-     {{0x0}}}, // Overloads: hfd
+     {{0x407}},
+     {{0x7}}}, // Overloads: hfd<hfd
     {OC::IsNaN,
      "IsNaN",
      OCC::IsSpecialFloat,
@@ -144,112 +144,112 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Sin,
      "Sin",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Tan,
      "Tan",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Acos,
      "Acos",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Asin,
      "Asin",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Atan,
      "Atan",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Hcos,
      "Hcos",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Hsin,
      "Hsin",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Htan,
      "Htan",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Exp,
      "Exp",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Frc,
      "Frc",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Log,
      "Log",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Sqrt,
      "Sqrt",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Rsqrt,
      "Rsqrt",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
 
     // Unary float - rounding
     {OC::Round_ne,
@@ -258,32 +258,32 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Round_ni,
      "Round_ni",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Round_pi,
      "Round_pi",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::Round_z,
      "Round_z",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
 
     // Unary int
     {OC::Bfrev,
@@ -292,8 +292,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
     {OC::Countbits,
      "Countbits",
      OCC::UnaryBits,
@@ -338,16 +338,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "binary",
      Attribute::ReadNone,
      1,
-     {{0x7}},
-     {{0x0}}}, // Overloads: hfd
+     {{0x407}},
+     {{0x7}}}, // Overloads: hfd<hfd
     {OC::FMin,
      "FMin",
      OCC::Binary,
      "binary",
      Attribute::ReadNone,
      1,
-     {{0x7}},
-     {{0x0}}}, // Overloads: hfd
+     {{0x407}},
+     {{0x7}}}, // Overloads: hfd<hfd
 
     // Binary int
     {OC::IMax,
@@ -356,16 +356,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "binary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
     {OC::IMin,
      "IMin",
      OCC::Binary,
      "binary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
 
     // Binary uint
     {OC::UMax,
@@ -374,16 +374,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "binary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
     {OC::UMin,
      "UMin",
      OCC::Binary,
      "binary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
 
     // Binary int with two outputs
     {OC::IMul,
@@ -438,16 +438,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "tertiary",
      Attribute::ReadNone,
      1,
-     {{0x7}},
-     {{0x0}}}, // Overloads: hfd
+     {{0x407}},
+     {{0x7}}}, // Overloads: hfd<hfd
     {OC::Fma,
      "Fma",
      OCC::Tertiary,
      "tertiary",
      Attribute::ReadNone,
      1,
-     {{0x4}},
-     {{0x0}}}, // Overloads: d
+     {{0x404}},
+     {{0x4}}}, // Overloads: d<d
 
     // Tertiary int
     {OC::IMad,
@@ -456,8 +456,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "tertiary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
 
     // Tertiary uint
     {OC::UMad,
@@ -466,8 +466,8 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "tertiary",
      Attribute::ReadNone,
      1,
-     {{0xe0}},
-     {{0x0}}}, // Overloads: wil
+     {{0x4e0}},
+     {{0xe0}}}, // Overloads: wil<wil
 
     // Tertiary int
     {OC::Msad,
@@ -764,32 +764,32 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = {
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::DerivCoarseY,
      "DerivCoarseY",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::DerivFineX,
      "DerivFineX",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
     {OC::DerivFineY,
      "DerivFineY",
      OCC::Unary,
      "unary",
      Attribute::ReadNone,
      1,
-     {{0x3}},
-     {{0x0}}}, // Overloads: hf
+     {{0x403}},
+     {{0x3}}}, // Overloads: hf<hf
 
     // Pixel shader
     {OC::EvalSnapped,
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 4d8201df8d..c0f9d7fddd 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -424,6 +424,14 @@ struct IntrinsicLower {
 // IOP intrinsics.
 namespace {
 
+// Creates the necessary scalar calls for a "trivial" operation where only
+// call instructions to a single function type are needed.
+// The overload type `Ty` determines what scalarization might be required.
+// Elements of any vectors in `refArgs` are extracted into scalars for each
+// call generated while the same scalar values are used unaltered in each call.
+// Utility objects `hlslOP` and `Builder` are used to generate calls to the
+// given `dxilFunc` for each set of scalar arguments.
+// The results are reconstructed into the given `RetTy` as needed.
 Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode,
                             ArrayRef<Value *> refArgs, Type *Ty, Type *RetTy,
                             OP *hlslOP, IRBuilder<> &Builder) {
@@ -459,12 +467,40 @@ Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode,
     }
   }
 }
-// Generates a DXIL operation over an overloaded type (Ty), returning a
-// RetTy value; when Ty is a vector, it will replicate per-element operations
-// into RetTy to rebuild it.
+
+// Creates a native vector call for a "trivial" operation where only a single
+// call instruction is needed. The overload and return types are the same type
+// `Ty`; the call is left unnamed when `Ty` is void.
+// Utility objects `OP` and `Builder` are used to create a call to the given
+// `Func` with `Args` arguments.
+Value *TrivialDxilVectorOperation(Function *Func, OP::OpCode Opcode,
+                                  ArrayRef<Value *> Args, Type *Ty, OP *OP,
+                                  IRBuilder<> &Builder) {
+  if (!Ty->isVoidTy())
+    return Builder.CreateCall(Func, Args, OP->GetOpCodeName(Opcode));
+  return Builder.CreateCall(Func, Args); // Cannot add name to void.
+}
+
+// Generates a DXIL operation with the overloaded type based on `Ty` and return
+// type `RetTy`. When Ty is a vector, it will either generate per-element calls
+// for each vector element and reconstruct the vector type from those results or
+// operate on and return native vectors depending on vector size and the
+// legality of the vector overload.
 Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef<Value *> refArgs,
                             Type *Ty, Type *RetTy, OP *hlslOP,
                             IRBuilder<> &Builder) {
+
+  // If supported and the overload type is a vector with more than 1 element,
+  // create a native vector operation.
+  if (Ty->isVectorTy() && Ty->getVectorNumElements() > 1 &&
+      hlslOP->GetModule()->GetHLModule().GetShaderModel()->IsSM69Plus() &&
+      OP::IsOverloadLegal(opcode, Ty)) {
+    Function *dxilFunc = hlslOP->GetOpFunc(opcode, Ty);
+    return TrivialDxilVectorOperation(dxilFunc, opcode, refArgs, Ty, hlslOP,
+                                      Builder);
+  }
+
+  // Set overload type to the scalar type of `Ty` and generate call(s).
   Type *EltTy = Ty->getScalarType();
   Function *dxilFunc = hlslOP->GetOpFunc(opcode, EltTy);
 
@@ -484,20 +520,34 @@ Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef<Value *> refArgs,
   return TrivialDxilOperation(opcode, refArgs, Ty, Inst->getType(), hlslOP, B);
 }
 
-Value *TrivialDxilUnaryOperationRet(OP::OpCode opcode, Value *src, Type *RetTy,
-                                    hlsl::OP *hlslOP, IRBuilder<> &Builder) {
-  Type *Ty = src->getType();
+// Translate a call that lowers to a dxil unary operation whose return type
+// may differ from its overload type: the overload is taken from the argument
+// while the return type is taken from the call `CI` itself.
+Value *TrivialUnaryOperationRet(CallInst *CI, IntrinsicOp IOP,
+                                OP::OpCode OpCode,
+                                HLOperationLowerHelper &Helper,
+                                HLObjectOperationLowerHelper *,
+                                bool &Translated) {
+  Value *Src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
+  Type *Ty = Src->getType(); // Overload type.
 
-  Constant *opArg = hlslOP->GetU32Const((unsigned)opcode);
-  Value *args[] = {opArg, src};
+  IRBuilder<> Builder(CI);
+  hlsl::OP *OP = &Helper.hlslOP;
+  Type *RetTy = CI->getType(); // May differ from the overload type.
+  Constant *OpArg = OP->GetU32Const((unsigned)OpCode);
+  Value *Args[] = {OpArg, Src};
 
-  return TrivialDxilOperation(opcode, args, Ty, RetTy, hlslOP, Builder);
+  return TrivialDxilOperation(OpCode, Args, Ty, RetTy, OP, Builder);
 }
 
-Value *TrivialDxilUnaryOperation(OP::OpCode opcode, Value *src,
-                                 hlsl::OP *hlslOP, IRBuilder<> &Builder) {
-  return TrivialDxilUnaryOperationRet(opcode, src, src->getType(), hlslOP,
-                                      Builder);
+Value *TrivialDxilUnaryOperation(OP::OpCode OpCode, Value *Src, hlsl::OP *Op,
+                                 IRBuilder<> &Builder) {
+  Type *Ty = Src->getType(); // Serves as both overload and return type.
+
+  Constant *OpArg = Op->GetU32Const((unsigned)OpCode);
+  Value *Args[] = {OpArg, Src};
+
+  return TrivialDxilOperation(OpCode, Args, Ty, Ty, Op, Builder);
 }
 
 Value *TrivialDxilBinaryOperation(OP::OpCode opcode, Value *src0, Value *src1,
@@ -521,6 +571,9 @@ Value *TrivialDxilTrinaryOperation(OP::OpCode opcode, Value *src0, Value *src1,
   return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder);
 }
 
+// Translate call that trivially converts to a dxil unary operation by passing
+// argument, return type, and helper objects to either scalarizing or native
+// vector dxil operation creation depending on version and vector size.
 Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                              HLOperationLowerHelper &helper,
                              HLObjectOperationLowerHelper *pObjHelper,
@@ -528,11 +581,13 @@ Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   Value *src0 = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx);
   IRBuilder<> Builder(CI);
   hlsl::OP *hlslOP = &helper.hlslOP;
-  Value *retVal = TrivialDxilUnaryOperationRet(opcode, src0, CI->getType(),
-                                               hlslOP, Builder);
-  return retVal;
+
+  return TrivialDxilUnaryOperation(opcode, src0, hlslOP, Builder);
 }
 
+// Translate call that trivially converts to a dxil binary operation by passing
+// arguments, return type, and helper objects to either scalarizing or native
+// vector dxil operation creation depending on version and vector size.
 Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                               HLOperationLowerHelper &helper,
                               HLObjectOperationLowerHelper *pObjHelper,
@@ -547,6 +602,10 @@ Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   return binOp;
 }
 
+// Translate call that trivially converts to a dxil trinary (aka tertiary)
+// operation by passing arguments, return type, and helper objects to either
+// scalarizing or native vector dxil operation creation depending on version
+// and vector size.
 Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                                HLOperationLowerHelper &helper,
                                HLObjectOperationLowerHelper *pObjHelper,
@@ -738,6 +797,12 @@ bool CanUseFxcMulOnlyPatternForPow(IRBuilder<> &Builder, Value *x, Value *pow,
     }
   }
 
+  // Only apply on aggregates of 16 or fewer elements,
+  // representing the max 4x4 matrix size.
+  Type *Ty = x->getType();
+  if (Ty->isVectorTy() && Ty->getVectorNumElements() > 16)
+    return false;
+
   APFloat powAPF = isa<ConstantDataVector>(pow)
                        ? cast<ConstantDataVector>(pow)->getElementAsAPFloat(0)
                        : // should be a splat value
@@ -2019,7 +2084,7 @@ Value *TranslateFirstbitHi(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                            HLObjectOperationLowerHelper *pObjHelper,
                            bool &Translated) {
   Value *firstbitHi =
-      TrivialUnaryOperation(CI, IOP, opcode, helper, pObjHelper, Translated);
+      TrivialUnaryOperationRet(CI, IOP, opcode, helper, pObjHelper, Translated);
   // firstbitHi == -1? -1 : (bitWidth-1 -firstbitHi);
   IRBuilder<> Builder(CI);
   Constant *neg1 = Builder.getInt32(-1);
@@ -2052,7 +2117,7 @@ Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                            HLObjectOperationLowerHelper *pObjHelper,
                            bool &Translated) {
   Value *firstbitLo =
-      TrivialUnaryOperation(CI, IOP, opcode, helper, pObjHelper, Translated);
+      TrivialUnaryOperationRet(CI, IOP, opcode, helper, pObjHelper, Translated);
   return firstbitLo;
 }
 
@@ -2431,17 +2496,22 @@ Value *TrivialDotOperation(OP::OpCode opcode, Value *src0, Value *src1,
   return dotOP;
 }
 
-Value *TranslateIDot(Value *arg0, Value *arg1, unsigned vecSize,
-                     hlsl::OP *hlslOP, IRBuilder<> &Builder,
-                     bool Unsigned = false) {
-  auto madOpCode = Unsigned ? DXIL::OpCode::UMad : DXIL::OpCode::IMad;
+// Instead of using a DXIL intrinsic, implement a dot product operation using
+// multiply and add operations. Used for integer dots and long vectors.
+Value *ExpandDot(Value *arg0, Value *arg1, unsigned vecSize, hlsl::OP *hlslOP,
+                 IRBuilder<> &Builder,
+                 DXIL::OpCode MadOpCode = DXIL::OpCode::IMad) {
   Value *Elt0 = Builder.CreateExtractElement(arg0, (uint64_t)0);
   Value *Elt1 = Builder.CreateExtractElement(arg1, (uint64_t)0);
-  Value *Result = Builder.CreateMul(Elt0, Elt1);
-  for (unsigned iVecElt = 1; iVecElt < vecSize; ++iVecElt) {
-    Elt0 = Builder.CreateExtractElement(arg0, iVecElt);
-    Elt1 = Builder.CreateExtractElement(arg1, iVecElt);
-    Result = TrivialDxilTrinaryOperation(madOpCode, Elt0, Elt1, Result, hlslOP,
+  Value *Result;
+  if (Elt0->getType()->isFloatingPointTy())
+    Result = Builder.CreateFMul(Elt0, Elt1);
+  else
+    Result = Builder.CreateMul(Elt0, Elt1);
+  for (unsigned Elt = 1; Elt < vecSize; ++Elt) {
+    Elt0 = Builder.CreateExtractElement(arg0, Elt);
+    Elt1 = Builder.CreateExtractElement(arg1, Elt);
+    Result = TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, hlslOP,
                                          Builder);
   }
 
@@ -2480,12 +2550,16 @@ Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   unsigned vecSize = Ty->getVectorNumElements();
   Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
   IRBuilder<> Builder(CI);
-  if (Ty->getScalarType()->isFloatingPointTy()) {
+  Type *EltTy = Ty->getScalarType();
+  if (EltTy->isFloatingPointTy() && Ty->getVectorNumElements() <= 4)
     return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder);
-  } else {
-    return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder,
-                         IOP == IntrinsicOp::IOP_udot);
-  }
+
+  DXIL::OpCode MadOpCode = DXIL::OpCode::IMad;
+  if (IOP == IntrinsicOp::IOP_udot)
+    MadOpCode = DXIL::OpCode::UMad;
+  else if (EltTy->isFloatingPointTy())
+    MadOpCode = DXIL::OpCode::FMad;
+  return ExpandDot(arg0, arg1, vecSize, hlslOP, Builder, MadOpCode);
 }
 
 Value *TranslateNormalize(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
@@ -3032,8 +3106,10 @@ Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
       if (arg0Ty->getScalarType()->isFloatingPointTy()) {
         return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder);
       } else {
-        return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder,
-                             IOP == IntrinsicOp::IOP_umul);
+        DXIL::OpCode MadOpCode = DXIL::OpCode::IMad;
+        if (IOP == IntrinsicOp::IOP_umul)
+          MadOpCode = DXIL::OpCode::UMad;
+        return ExpandDot(arg0, arg1, vecSize, hlslOP, Builder, MadOpCode);
       }
     } else {
       // mul(vector, scalar) == vector * scalar-splat
@@ -6150,20 +6226,8 @@ Value *TranslateAnd(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                     bool &Translated) {
   Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx);
   Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
-  Type *Ty = CI->getType();
-  Type *EltTy = Ty->getScalarType();
   IRBuilder<> Builder(CI);
 
-  if (Ty != EltTy) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) {
-      Value *EltX = Builder.CreateExtractElement(x, i);
-      Value *EltY = Builder.CreateExtractElement(y, i);
-      Value *tmp = Builder.CreateAnd(EltX, EltY);
-      Result = Builder.CreateInsertElement(Result, tmp, i);
-    }
-    return Result;
-  }
   return Builder.CreateAnd(x, y);
 }
 Value *TranslateOr(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
@@ -6171,20 +6235,8 @@ Value *TranslateOr(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
                    HLObjectOperationLowerHelper *pObjHelper, bool &Translated) {
   Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx);
   Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx);
-  Type *Ty = CI->getType();
-  Type *EltTy = Ty->getScalarType();
   IRBuilder<> Builder(CI);
 
-  if (Ty != EltTy) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) {
-      Value *EltX = Builder.CreateExtractElement(x, i);
-      Value *EltY = Builder.CreateExtractElement(y, i);
-      Value *tmp = Builder.CreateOr(EltX, EltY);
-      Result = Builder.CreateInsertElement(Result, tmp, i);
-    }
-    return Result;
-  }
   return Builder.CreateOr(x, y);
 }
 Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
@@ -6194,21 +6246,8 @@ Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode,
   Value *cond = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx);
   Value *t = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx);
   Value *f = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx);
-  Type *Ty = CI->getType();
-  Type *EltTy = Ty->getScalarType();
   IRBuilder<> Builder(CI);
 
-  if (Ty != EltTy) {
-    Value *Result = UndefValue::get(Ty);
-    for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) {
-      Value *EltCond = Builder.CreateExtractElement(cond, i);
-      Value *EltTrue = Builder.CreateExtractElement(t, i);
-      Value *EltFalse = Builder.CreateExtractElement(f, i);
-      Value *tmp = Builder.CreateSelect(EltCond, EltTrue, EltFalse);
-      Result = Builder.CreateInsertElement(Result, tmp, i);
-    }
-    return Result;
-  }
   return Builder.CreateSelect(cond, t, f);
 }
 } // namespace
@@ -6467,18 +6506,20 @@ IntrinsicLower gLowerTable[] = {
     {IntrinsicOp::IOP_clip, TranslateClip, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_cos, TrivialUnaryOperation, DXIL::OpCode::Cos},
     {IntrinsicOp::IOP_cosh, TrivialUnaryOperation, DXIL::OpCode::Hcos},
-    {IntrinsicOp::IOP_countbits, TrivialUnaryOperation,
+    {IntrinsicOp::IOP_countbits, TrivialUnaryOperationRet,
      DXIL::OpCode::Countbits},
     {IntrinsicOp::IOP_cross, TranslateCross, DXIL::OpCode::NumOpCodes},
-    {IntrinsicOp::IOP_ddx, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseX},
-    {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperation,
+    {IntrinsicOp::IOP_ddx, TrivialUnaryOperationRet,
      DXIL::OpCode::DerivCoarseX},
-    {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperation,
+    {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperationRet,
+     DXIL::OpCode::DerivCoarseX},
+    {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperationRet,
      DXIL::OpCode::DerivFineX},
-    {IntrinsicOp::IOP_ddy, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseY},
-    {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperation,
+    {IntrinsicOp::IOP_ddy, TrivialUnaryOperationRet,
+     DXIL::OpCode::DerivCoarseY},
+    {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperationRet,
      DXIL::OpCode::DerivCoarseY},
-    {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperation,
+    {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperationRet,
      DXIL::OpCode::DerivFineY},
     {IntrinsicOp::IOP_degrees, TranslateDegrees, DXIL::OpCode::NumOpCodes},
     {IntrinsicOp::IOP_determinant, EmptyLower, DXIL::OpCode::NumOpCodes},
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index 6796badcb6..72dd6d41aa 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -6606,8 +6606,10 @@ bool HLSLExternalSource::MatchArguments(
   argTypes.clear();
   const bool isVariadic = IsVariadicIntrinsicFunction(pIntrinsic);
 
-  static const UINT UnusedSize = 0xFF;
-  static const BYTE MaxIntrinsicArgs = g_MaxIntrinsicParamCount + 1;
+  static const uint32_t UnusedSize = std::numeric_limits<uint32_t>::max();
+  static const uint32_t MaxIntrinsicArgs = g_MaxIntrinsicParamCount + 1;
+  assert(MaxIntrinsicArgs < std::numeric_limits<uint8_t>::max() &&
+         "This should be a pretty small number");
 #define CAB(cond, arg)                                                         \
   {                                                                            \
     if (!(cond)) {                                                             \
@@ -6622,7 +6624,7 @@ bool HLSLExternalSource::MatchArguments(
   ArBasicKind
       ComponentType[MaxIntrinsicArgs]; // Component type for each argument,
                                        // AR_BASIC_UNKNOWN if unspecified.
-  UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UNUSED_INDEX32
+  UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UnusedSize
                                        // if unspecified.
   badArgIdx = MaxIntrinsicArgs;
 
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl
new file mode 100644
index 0000000000..0b7f0d6b2f
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl
@@ -0,0 +1,394 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=2   %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=7   %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=125 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=256 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=1024 %s | FileCheck %s
+
+// Test vector-enabled non-trivial intrinsics that take parameters of various types.
+
+RWByteAddressBuffer buf;
+RWByteAddressBuffer ibuf;
+
+// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16>
+// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32>
+// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64>
+
+// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half>
+// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float>
+// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double>
+
+[numthreads(8,1,1)]
+void main() {
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle {{%.*}}, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512
+  // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  vector<float16_t, NUM> hVec1 = buf.Load<vector<float16_t, NUM> >(0);
+  vector<float16_t, NUM> hVec2 = buf.Load<vector<float16_t, NUM> >(512);
+  vector<float16_t, NUM> hVec3 = buf.Load<vector<float16_t, NUM> >(1024);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560
+  // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 3072
+  // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  vector<float, NUM> fVec1 = buf.Load<vector<float, NUM> >(2048);
+  vector<float, NUM> fVec2 = buf.Load<vector<float, NUM> >(2560);
+  vector<float, NUM> fVec3 = buf.Load<vector<float, NUM> >(3072);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
+  // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
+  // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  vector<double, NUM> dVec1 = buf.Load<vector<double, NUM> >(4096);
+  vector<double, NUM> dVec2 = buf.Load<vector<double, NUM> >(4608);
+  vector<double, NUM> dVec3 = buf.Load<vector<double, NUM> >(5120);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle {{%.*}}, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512
+  // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[svec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<int16_t, NUM> sVec1 = ibuf.Load<vector<int16_t, NUM> >(0);
+  vector<int16_t, NUM> sVec2 = ibuf.Load<vector<int16_t, NUM> >(512);
+  vector<int16_t, NUM> sVec3 = ibuf.Load<vector<int16_t, NUM> >(1024);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1025
+  // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536
+  // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[usvec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<uint16_t, NUM> usVec1 = ibuf.Load<vector<uint16_t, NUM> >(1025);
+  vector<uint16_t, NUM> usVec2 = ibuf.Load<vector<uint16_t, NUM> >(1536);
+  vector<uint16_t, NUM> usVec3 = ibuf.Load<vector<uint16_t, NUM> >(2048);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2049
+  // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560
+  // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072
+  // CHECK: [[ivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<int, NUM> iVec1 = ibuf.Load<vector<int, NUM> >(2049);
+  vector<int, NUM> iVec2 = ibuf.Load<vector<int, NUM> >(2560);
+  vector<int, NUM> iVec3 = ibuf.Load<vector<int, NUM> >(3072);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3073
+  // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584
+  // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[uivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<uint, NUM> uiVec1 = ibuf.Load<vector<uint, NUM> >(3073);
+  vector<uint, NUM> uiVec2 = ibuf.Load<vector<uint, NUM> >(3584);
+  vector<uint, NUM> uiVec3 = ibuf.Load<vector<uint, NUM> >(4096);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4097
+  // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
+  // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
+  // CHECK: [[lvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<int64_t, NUM> lVec1 = ibuf.Load<vector<int64_t, NUM> >(4097);
+  vector<int64_t, NUM> lVec2 = ibuf.Load<vector<int64_t, NUM> >(4608);
+  vector<int64_t, NUM> lVec3 = ibuf.Load<vector<int64_t, NUM> >(5120);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5121
+  // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632
+  // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 6144
+  // CHECK: [[ulvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<uint64_t, NUM> ulVec1 = ibuf.Load<vector<uint64_t, NUM> >(5121);
+  vector<uint64_t, NUM> ulVec2 = ibuf.Load<vector<uint64_t, NUM> >(5632);
+  vector<uint64_t, NUM> ulVec3 = ibuf.Load<vector<uint64_t, NUM> >(6144);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 35, <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]])  ; FMax(a,b)
+  // CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 36, <[[NUM]] x half> [[tmp]], <[[NUM]] x half> [[hvec3]])  ; FMin(a,b)
+  vector<float16_t, NUM> hRes = clamp(hVec1, hVec2, hVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 35, <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]])  ; FMax(a,b)
+  // CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 36, <[[NUM]] x float> [[tmp]], <[[NUM]] x float> [[fvec3]])  ; FMin(a,b)
+  vector<float, NUM> fRes = clamp(fVec1, fVec2, fVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 35, <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]])  ; FMax(a,b)
+  // CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 36, <[[NUM]] x double> [[tmp]], <[[NUM]] x double> [[dvec3]])  ; FMin(a,b)
+  vector<double, NUM> dRes = clamp(dVec1, dVec2, dVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 37, <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]])  ; IMax(a,b)
+  // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 38, <[[NUM]] x i16> [[tmp]], <[[NUM]] x i16> [[svec3]])  ; IMin(a,b)
+  vector<int16_t, NUM> sRes = clamp(sVec1, sVec2, sVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 39, <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]])  ; UMax(a,b)
+  // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 40, <[[NUM]] x i16> [[tmp]], <[[NUM]] x i16> [[usvec3]])  ; UMin(a,b)
+  vector<uint16_t, NUM> usRes = clamp(usVec1, usVec2, usVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 37, <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]])  ; IMax(a,b)
+  // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 38, <[[NUM]] x i32> [[tmp]], <[[NUM]] x i32> [[ivec3]])  ; IMin(a,b)
+  vector<int, NUM> iRes = clamp(iVec1, iVec2, iVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 39, <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]])  ; UMax(a,b)
+  // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 40, <[[NUM]] x i32> [[tmp]], <[[NUM]] x i32> [[uivec3]])  ; UMin(a,b)
+  vector<uint, NUM> uiRes = clamp(uiVec1, uiVec2, uiVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 37, <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]])  ; IMax(a,b)
+  // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 38, <[[NUM]] x i64> [[tmp]], <[[NUM]] x i64> [[lvec3]])  ; IMin(a,b)
+  vector<int64_t, NUM> lRes = clamp(lVec1, lVec2, lVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 39, <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]])  ; UMax(a,b)
+  // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 40, <[[NUM]] x i64> [[tmp]], <[[NUM]] x i64> [[ulvec3]])  ; UMin(a,b)
+  vector<uint64_t, NUM> ulRes = clamp(ulVec1, ulVec2, ulVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = fcmp fast olt <[[NUM]] x half> [[hvec2]], [[hvec1]]
+  // CHECK: select <[[NUM]] x i1> [[tmp]], <[[NUM]] x half> zeroinitializer, <[[NUM]] x half> <half 0xH3C00
+  hRes += step(hVec1, hVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = fcmp fast olt <[[NUM]] x float> [[fvec2]], [[fvec1]]
+  // CHECK: select <[[NUM]] x i1> [[tmp]], <[[NUM]] x float> zeroinitializer, <[[NUM]] x float> <float 1
+  fRes += step(fVec1, fVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = fmul fast <[[NUM]] x half> [[hvec1]], <half 0x
+  // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 21, <[[NUM]] x half> [[tmp]])  ; Exp(value)
+  hRes += exp(hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = fmul fast <[[NUM]] x float> [[fvec1]], <float 0x
+  // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 21, <[[NUM]] x float> [[tmp]])  ; Exp(value)
+  fRes += exp(fVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 23, <[[NUM]] x half> [[hvec1]])  ; Log(value)
+  // CHECK: fmul fast <[[NUM]] x half> [[tmp]], <half 0xH398C
+  hRes += log(hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 23, <[[NUM]] x float> [[fvec1]])  ; Log(value)
+  // CHECK: fmul fast <[[NUM]] x float> [[tmp]], <float 0x3FE62E4300000000
+  fRes += log(fVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x half> [[hvec2]], [[hvec1]]
+  // CHECK: [[xsub:%.*]] = fsub fast <[[NUM]] x half> [[hvec3]], [[hvec1]]
+  // CHECK: [[div:%.*]] = fdiv fast <[[NUM]] x half> [[xsub]], [[sub]]
+  // CHECK: [[sat:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 7, <[[NUM]] x half> [[div]])  ; Saturate(value)
+  // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x half> [[sat]], <half 0xH4000,
+  // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x half> <half 0xH4200, {{.*}}>, [[mul]]
+  // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x half> [[sat]], [[sat]]
+  // CHECK: fmul fast <[[NUM]] x half> [[mul]], [[sub]]
+  hRes += smoothstep(hVec1, hVec2, hVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x float> [[fvec2]], [[fvec1]]
+  // CHECK: [[xsub:%.*]] = fsub fast <[[NUM]] x float> [[fvec3]], [[fvec1]]
+  // CHECK: [[div:%.*]] = fdiv fast <[[NUM]] x float> [[xsub]], [[sub]]
+  // CHECK: [[sat:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 7, <[[NUM]] x float> [[div]])  ; Saturate(value)
+  // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x float> [[sat]], <float 2.000000e+00,
+  // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x float> <float 3.000000e+00, {{.*}}>, [[mul]]
+  // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x float> [[sat]], [[sat]]
+  // CHECK: fmul fast <[[NUM]] x float> [[mul]], [[sub]]
+  fRes += smoothstep(fVec1, fVec2, fVec3);
+
+  // Intrinsics that expand into llvm ops.
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: fmul fast <[[NUM]] x half> [[hvec2]], <half 0xH5329
+  hRes += degrees(hVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: fmul fast <[[NUM]] x float> [[fvec2]], <float 0x404CA5DC20000000
+  fRes += degrees(fVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: fmul fast <[[NUM]] x half> [[hvec3]], <half 0xH2478
+  hRes += radians(hVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: fmul fast <[[NUM]] x float> [[fvec3]], <float 0x3F91DF46A0000000
+  fRes += radians(fVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[cmp:%.*]] = fcmp fast une <[[NUM]] x float> [[fvec1]], zeroinitializer
+  // CHECK: [[f2i:%.*]] = bitcast <[[NUM]] x float> [[fvec1]] to <[[NUM]] x i32>
+  // CHECK: [[and:%.*]] = and <[[NUM]] x i32> [[f2i]], <i32 2139095040
+  // CHECK: [[add:%.*]] = add nsw <[[NUM]] x i32> [[and]], <i32 -1056964608
+  // CHECK: [[shr:%.*]] = ashr <[[NUM]] x i32> [[add]], <i32 23
+  // CHECK: [[i2f:%.*]] = sitofp <[[NUM]] x i32> [[shr]] to <[[NUM]] x float>
+  // CHECK: [[sel:%.*]] = select <[[NUM]] x i1> [[cmp]], <[[NUM]] x float> [[i2f]], <[[NUM]] x float> zeroinitializer
+  // CHECK: [[and:%.*]] = and <[[NUM]] x i32> [[f2i]], <i32 8388607
+  // CHECK: or <[[NUM]] x i32> [[and]], <i32 1056964608
+  vector<float, NUM> exp = fVec3;
+  fRes += frexp(fVec1, exp);
+  fRes += exp;
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = fsub fast <[[NUM]] x half> [[hvec3]], [[hvec2]]
+  // CHECK: fmul fast <[[NUM]] x half> [[tmp]], [[hvec1]]
+  hRes += lerp(hVec2, hVec3, hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[tmp:%.*]] = fsub fast <[[NUM]] x float> [[fvec3]], [[fvec2]]
+  // CHECK: fmul fast <[[NUM]] x float> [[tmp]], [[fvec1]]
+  fRes += lerp(fVec2, fVec3, fVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: fdiv fast <[[NUM]] x half> <half 0xH3C00, {{.*}}>, [[hvec1]]
+  hRes += rcp(hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: fdiv fast <[[NUM]] x float> <float 1.000000e+00, {{.*}}>, [[fvec1]]
+  fRes += rcp(fVec1);
+
+  vector<uint, NUM> signs = 1;
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x half> [[hvec1]], zeroinitializer
+  // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x half> [[hvec1]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32>
+  // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32>
+  // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]]
+  signs *= sign(hVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x float> [[fvec1]], zeroinitializer
+  // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x float> [[fvec1]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32>
+  // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32>
+  // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]]
+  signs *= sign(fVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x double> [[dvec1]], zeroinitializer
+  // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x double> [[dvec1]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32>
+  // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32>
+  // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]]
+  signs *= sign(dVec1);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i16> [[svec2]], zeroinitializer
+  // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i16> [[svec2]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32>
+  // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32>
+  // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]]
+  signs *= sign(sVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i16> [[usvec2]], zeroinitializer
+  // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32>
+  signs *= sign(usVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i32> [[ivec2]], zeroinitializer
+  // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i32> [[ivec2]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32>
+  // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32>
+  // CHECK: [[sub:%.*]] = sub nsw <[[NUM]] x i32> [[igt]], [[ilt]]
+  signs *= sign(iVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i32> [[uivec2]], zeroinitializer
+  // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32>
+  signs *= sign(uiVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i64> [[lvec2]], zeroinitializer
+  // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i64> [[lvec2]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32>
+  // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32>
+  // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]]
+  signs *= sign(lVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i64> [[ulvec2]], zeroinitializer
+  // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32>
+  signs *= sign(ulVec2);
+
+  iRes += signs;
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[bvec2:%.*]] = icmp ne <[[NUM]] x i16> [[svec2]], zeroinitializer
+  // CHECK: [[bvec1:%.*]] = icmp ne <[[NUM]] x i16> [[svec1]], zeroinitializer
+  // CHECK: or <[[NUM]] x i1> [[bvec2]], [[bvec1]]
+  sRes += or(sVec1, sVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: [[bvec3:%.*]] = icmp ne <[[NUM]] x i16> [[svec3]], zeroinitializer
+  // CHECK: and <[[NUM]] x i1> [[bvec3]], [[bvec2]]
+  sRes += and(sVec2, sVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: select <[[NUM]] x i1> [[bvec1]], <[[NUM]] x i16> [[svec2]], <[[NUM]] x i16> [[svec3]]
+  sRes += select(sVec1, sVec2, sVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<float16_t, NUM> >(0, hRes);
+  buf.Store<vector<float, NUM> >(2048, fRes);
+  buf.Store<vector<double, NUM> >(4096, dRes);
+
+  ibuf.Store<vector<int16_t, NUM> >(0, sRes);
+  ibuf.Store<vector<uint16_t, NUM> >(1024, usRes);
+  ibuf.Store<vector<int, NUM> >(2048, iRes);
+  ibuf.Store<vector<uint, NUM> >(3072, uiRes);
+  ibuf.Store<vector<int64_t, NUM> >(4096, lRes);
+  ibuf.Store<vector<uint64_t, NUM> >(5120, ulRes);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl
new file mode 100644
index 0000000000..2ae3c92e85
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl
@@ -0,0 +1,115 @@
+// RUN: %dxc -T lib_6_9 %s | FileCheck %s
+
+// Long vector tests for vec ops that scalarize to something more complex
+//  than a simple repetition of the same dx.op calls.
+
+// CHECK-LABEL: test_atan2
+// CHECK: fdiv fast <8 x float>
+// CHECK: call <8 x float> @dx.op.unary.v8f32(i32 17, <8 x float> %{{.*}}) ; Atan(value)
+// CHECK: fadd fast <8 x float> %{{.*}}, <float 0x
+// CHECK: fadd fast <8 x float> %{{.*}}, <float 0x
+// CHECK: fcmp fast olt <8 x float>
+// CHECK: fcmp fast oeq <8 x float>
+// CHECK: fcmp fast oge <8 x float>
+// CHECK: fcmp fast olt <8 x float>
+// CHECK: and <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float>
+// CHECK: and <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float>
+// CHECK: and <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> <float 0x
+// CHECK: and <8 x i1>
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> <float 0x
+export void test_atan2(inout vector<float, 8> vec1, vector<float, 8> vec2) {
+  vec1 = atan2(vec1, vec2);
+}
+
+// CHECK-LABEL: test_fmod
+// CHECK: fdiv fast <8 x float>
+// CHECK: fsub fast <8 x float> <float
+// CHECK: fcmp fast oge <8 x float>
+// CHECK: call <8 x float> @dx.op.unary.v8f32(i32 6, <8 x float> %{{.*}}) ; FAbs(value)
+// CHECK: call <8 x float> @dx.op.unary.v8f32(i32 22, <8 x float> %{{.*}}) ; Frc(value)
+
+// CHECK: fsub fast <8 x float> <float
+// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float>
+// CHECK: fmul fast <8 x float>
+export void test_fmod(inout vector<float, 8> vec1, vector<float, 8> vec2) {
+  vec1 = fmod(vec1, vec2);
+}
+
+// CHECK-LABEL: test_ldexp
+// CHECK: call <8 x float> @dx.op.unary.v8f32(i32 21, <8 x float> %{{.*}}) ; Exp(value)
+// CHECK: fmul fast <8 x float>
+
+export void test_ldexp(inout vector<float, 8> vec1, vector<float, 8> vec2) {
+  vec1 = ldexp(vec1, vec2);
+}
+
+
+// CHECK-LABEL: test_pow
+// CHECK: call <8 x float> @dx.op.unary.v8f32(i32 23, <8 x float> %{{.*}}) ; Log(value)
+// CHECK: fmul fast <8 x float>
+// CHECK: call <8 x float> @dx.op.unary.v8f32(i32 21, <8 x float> %{{.*}}) ; Exp(value)
+export void test_pow(inout vector<float, 8> vec1, vector<float, 8> vec2) {
+  vec1 = pow(vec1, vec2);
+}
+
+// CHECK-LABEL: test_modf
+// CHECK: call <8 x float>  @dx.op.unary.v8f32(i32 29, <8 x float>  %{{.*}}) ; Round_z(value)
+// CHECK: fsub fast <8 x float>
+export void test_modf(inout vector<float, 8> vec1, vector<float, 8> vec2) {
+  vec1 = modf(vec1, vec2);
+}
+
+// CHECK-LABEL: test_dot
+// CHECK: [[el:%.*]] = extractelement <8 x float>
+// CHECK: [[mul:%.*]] = fmul fast float [[el]]
+// CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mul]]) ; FMad(a,b,c)
+// CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c)
+// CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c)
+// CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c)
+// CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c)
+// CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c)
+// CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c)
+export void test_dot(inout vector<float, 8> vec1, vector<float, 8> vec2) {
+  vec1 = dot(vec1, vec2);
+}
+
+// CHECK-LABEL: test_any
+// CHECK: or i1
+// CHECK: or i1
+// CHECK: or i1
+// CHECK: or i1
+// CHECK: or i1
+// CHECK: or i1
+// CHECK: or i1
+export void test_any(vector<float, 8> vec1, inout vector<bool, 8> bvec) {
+  bvec &= any(vec1);
+}
+
+// CHECK-LABEL: test_all
+// CHECK: and i1
+// CHECK: and i1
+// CHECK: and i1
+// CHECK: and i1
+// CHECK: and i1
+// CHECK: and i1
+// CHECK: and i1
+export void test_all(vector<float, 8> vec1, inout vector<bool, 8> bvec) {
+  bvec &= all(vec1);
+}
+
+// CHECK-LABEL: test_WaveMatch
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+// call {{.*}} @dx.op.wave
+export uint4 test_WaveMatch(vector<bool, 8> bvec) {
+  return WaveMatch(bvec);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl
new file mode 100644
index 0000000000..02cad5b894
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl
@@ -0,0 +1,69 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max   -DOP=35 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max   -DOP=35 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min   -DOP=36 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min   -DOP=36 -DNUM=1022 %s | FileCheck %s
+
+// Test vector-enabled binary intrinsics that take float-like parameters and
+// are "trivial" in that they can be implemented with a single call
+// instruction with the same parameter and return types.
+
+RWByteAddressBuffer buf;
+
+// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half>
+// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float>
+// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double>
+
+[numthreads(8,1,1)]
+void main() {
+
+  // Capture opcode number.
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]]
+  buf.Store(999, OP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512
+  // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  vector<float16_t, NUM> hVec1 = buf.Load<vector<float16_t, NUM> >(0);
+  vector<float16_t, NUM> hVec2 = buf.Load<vector<float16_t, NUM> >(512);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560
+  // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  vector<float, NUM> fVec1 = buf.Load<vector<float, NUM> >(2048);
+  vector<float, NUM> fVec2 = buf.Load<vector<float, NUM> >(2560);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
+  // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  vector<double, NUM> dVec1 = buf.Load<vector<double, NUM> >(4096);
+  vector<double, NUM> dVec2 = buf.Load<vector<double, NUM> >(4608);
+
+  // Test simple matching type overloads.
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]])
+  vector<float16_t, NUM> hRes = FUNC(hVec1, hVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]])
+  vector<float, NUM> fRes = FUNC(fVec1, fVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]])
+  vector<double, NUM> dRes = FUNC(dVec1, dVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<float16_t, NUM> >(0, hRes);
+  buf.Store<vector<float, NUM> >(2048, fRes);
+  buf.Store<vector<double, NUM> >(4096, dRes);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl
new file mode 100644
index 0000000000..994246b753
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl
@@ -0,0 +1,116 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max   -DOP=37 -DUOP=39 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max   -DOP=37 -DUOP=39 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min   -DOP=38 -DUOP=40 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min   -DOP=38 -DUOP=40 -DNUM=1022 %s | FileCheck %s
+
+#ifndef UOP
+#define UOP OP
+#endif
+
+// Test vector-enabled binary intrinsics that take signed and unsigned integer parameters of
+// different widths and are "trivial" in that they can be implemented with a single call
+// instruction with the same parameter and return types.
+
+RWByteAddressBuffer buf;
+
+// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16>
+// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32>
+// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64>
+
+[numthreads(8,1,1)]
+void main() {
+  // Loads two NUM-element vectors for each signed/unsigned 16/32/64-bit integer type, applies FUNC pairwise, and stores the results.
+  // Capture opcode numbers: OP and UOP are stored as constants so the checks below can bind them from the emitted stores.
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]]
+  buf.Store(888, OP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]]
+  buf.Store(999, UOP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512
+  // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<int16_t, NUM> sVec1 = buf.Load<vector<int16_t, NUM> >(0);
+  vector<int16_t, NUM> sVec2 = buf.Load<vector<int16_t, NUM> >(512);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536
+  // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<uint16_t, NUM> usVec1 = buf.Load<vector<uint16_t, NUM> >(1024);
+  vector<uint16_t, NUM> usVec2 = buf.Load<vector<uint16_t, NUM> >(1536);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560
+  // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<int, NUM> iVec1 = buf.Load<vector<int, NUM> >(2048);
+  vector<int, NUM> iVec2 = buf.Load<vector<int, NUM> >(2560);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072
+  // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584
+  // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<uint, NUM> uiVec1 = buf.Load<vector<uint, NUM> >(3072);
+  vector<uint, NUM> uiVec2 = buf.Load<vector<uint, NUM> >(3584);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
+  // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<int64_t, NUM> lVec1 = buf.Load<vector<int64_t, NUM> >(4096);
+  vector<int64_t, NUM> lVec2 = buf.Load<vector<int64_t, NUM> >(4608);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
+  // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632
+  // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<uint64_t, NUM> ulVec1 = buf.Load<vector<uint64_t, NUM> >(5120);
+  vector<uint64_t, NUM> ulVec2 = buf.Load<vector<uint64_t, NUM> >(5632);
+
+  // Test simple matching type overloads: each FUNC call must remain one vector dx.op.binary call (OP for signed, UOP for unsigned) with no per-element extract/insert.
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]])
+  vector<int16_t, NUM> sRes = FUNC(sVec1, sVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]])
+  vector<uint16_t, NUM> usRes = FUNC(usVec1, usVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]])
+  vector<int, NUM> iRes = FUNC(iVec1, iVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]])
+  vector<uint, NUM> uiRes = FUNC(uiVec1, uiVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]])
+  vector<int64_t, NUM> lRes = FUNC(lVec1, lVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]])
+  vector<uint64_t, NUM> ulRes = FUNC(ulVec1, ulVec2);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<int16_t, NUM> >(0, sRes);
+  buf.Store<vector<uint16_t, NUM> >(1024, usRes);
+  buf.Store<vector<int, NUM> >(2048, iRes);
+  buf.Store<vector<uint, NUM> >(3072, uiRes);
+  buf.Store<vector<int64_t, NUM> >(4096, lRes);
+  buf.Store<vector<uint64_t, NUM> >(5120, ulRes);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl
new file mode 100644
index 0000000000..6ebb511b00
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl
@@ -0,0 +1,77 @@
+// The binary part of some of these is just vector math ops, with as many unary dxops as there are elements.
+// These will have apparent mismatches between the ARITY define and the check prefix.
+
+// RUN: %dxc -DFUNC=f16tof32    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,LEGACY
+// RUN: %dxc -DFUNC=f32tof16    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,LEGACY
+// RUN: %dxc -DFUNC=isfinite    -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT
+// RUN: %dxc -DFUNC=isinf       -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT
+// RUN: %dxc -DFUNC=isnan       -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT
+// RUN: %dxc -DFUNC=countbits   -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=firstbithigh -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=firstbitlow  -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY
+// RUN: %dxc -DFUNC=QuadReadLaneAt         -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
+// RUN: %dxc -DFUNC=QuadReadAcrossX        -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
+// RUN: %dxc -DFUNC=QuadReadAcrossY        -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
+// RUN: %dxc -DFUNC=QuadReadAcrossDiagonal -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD
+// RUN: %dxc -DFUNC=WaveActiveBitAnd       -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveBitOr        -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveBitXor       -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveProduct      -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveSum          -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveMin          -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveMax          -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveMultiPrefixBitAnd  -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveMultiPrefixBitOr   -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveMultiPrefixBitXor  -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveMultiPrefixProduct -DARITY=5 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveMultiPrefixSum     -DARITY=5 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WavePrefixSum          -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WavePrefixProduct      -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveReadLaneAt         -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveReadLaneFirst      -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+// RUN: %dxc -DFUNC=WaveActiveAllEqual     -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+#if ARITY == 1
+#define CALLARGS(x,y,z) x
+#elif ARITY == 2
+#define CALLARGS(x,y,z) x, y
+#elif ARITY == 3
+#define CALLARGS(x,y,z) x, y, z
+// ARITY 4 is used for 1 vec + scalar; the scalar `i` is not a macro argument, it is main's SV_PrimitiveID parameter
+#elif ARITY == 4
+#define CALLARGS(x,y,z) x, i
+// ARITY 5 is used for 1 vec + uint4 mask for wavemultiprefix*; the mask `m` is likewise main's uint4 parameter
+#elif ARITY == 5
+#define CALLARGS(x,y,z) x, m
+#endif
+
+StructuredBuffer< vector<TYPE, 8> > buf;
+ByteAddressBuffer rbuf;
+
+float4 main(uint i : SV_PrimitiveID, uint4 m : M) : SV_Target {
+  vector<TYPE, 8> arg1 = rbuf.Load< vector<TYPE, 8> >(i++*32);
+  vector<TYPE, 8> arg2 = rbuf.Load< vector<TYPE, 8> >(i++*32);
+  vector<TYPE, 8> arg3 = rbuf.Load< vector<TYPE, 8> >(i++*32);
+  // The op-specific prefix chosen per test invocation binds DXOP on the first call; the seven shared checks then match seven more calls to it (presumably one per element of the 8-wide vector).
+  // UNARY: call {{.*}} [[DXOP:@dx.op.unary]]
+  // BINARY: call {{.*}} [[DXOP:@dx.op.binary]]
+  // TERTIARY: call {{.*}} [[DXOP:@dx.op.tertiary]]
+  // LEGACY: call {{.*}} [[DXOP:@dx.op.legacy]]
+  // SPECFLT: call {{.*}} [[DXOP:@dx.op.isSpecialFloat]]
+  // QUAD: call {{.*}} [[DXOP:@dx.op.quad]]
+  // WAVE: call {{.*}} [[DXOP:@dx.op.wave]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CHECK: call {{.*}} [[DXOP]]
+  // CALLARGS forwards only the arguments selected by ARITY; for ARITY 1 just arg1 reaches FUNC.
+  vector<TYPE, 8> ret = FUNC(CALLARGS(arg1, arg2, arg3));
+  return float4(ret[0] + ret[1], ret[2] + ret[3], ret[4] + ret[5], ret[6] + ret[7]);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl
new file mode 100644
index 0000000000..e32ebc1db2
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl
@@ -0,0 +1,86 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad   -DOP=46 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad   -DOP=46 -DNUM=1022 %s | FileCheck %s
+
+// Test vector-enabled ternary intrinsics that take float-like parameters and
+// are "trivial" in that they can be implemented with a single call
+// instruction with the same parameter and return types.
+
+// Given that all we have at the moment are fmad and fma and the latter only takes doubles,
+// fma is tacked on as an additional check.
+
+RWByteAddressBuffer buf;
+
+// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half>
+// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float>
+// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double>
+
+[numthreads(8,1,1)]
+void main() {
+  // Loads three NUM-element vectors each of half, float and double, applies FUNC (and fma for doubles) to each triple, and stores the results.
+  // Capture opcode number: OP is stored as a constant so the checks below can bind it from the emitted store.
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]]
+  buf.Store(999, OP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512
+  // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  vector<float16_t, NUM> hVec1 = buf.Load<vector<float16_t, NUM> >(0);
+  vector<float16_t, NUM> hVec2 = buf.Load<vector<float16_t, NUM> >(512);
+  vector<float16_t, NUM> hVec3 = buf.Load<vector<float16_t, NUM> >(1024);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560
+  // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 3072
+  // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  vector<float, NUM> fVec1 = buf.Load<vector<float, NUM> >(2048);
+  vector<float, NUM> fVec2 = buf.Load<vector<float, NUM> >(2560);
+  vector<float, NUM> fVec3 = buf.Load<vector<float, NUM> >(3072);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
+  // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
+  // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
+  vector<double, NUM> dVec1 = buf.Load<vector<double, NUM> >(4096);
+  vector<double, NUM> dVec2 = buf.Load<vector<double, NUM> >(4608);
+  vector<double, NUM> dVec3 = buf.Load<vector<double, NUM> >(5120);
+
+  // Test simple matching type overloads: each FUNC call must remain one vector dx.op.tertiary call with no per-element extract/insert.
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x half> @dx.op.tertiary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]], <[[NUM]] x half> [[hvec3]])
+  vector<float16_t, NUM> hRes = FUNC(hVec1, hVec2, hVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x float> @dx.op.tertiary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]], <[[NUM]] x float> [[fvec3]])
+  vector<float, NUM> fRes = FUNC(fVec1, fVec2, fVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x double> @dx.op.tertiary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]], <[[NUM]] x double> [[dvec3]])
+  vector<double, NUM> dRes = FUNC(dVec1, dVec2, dVec3);
+
+  // Tacked on fma() check since it only takes doubles; its opcode (47) is hard-coded in the check instead of coming from OP.
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x double> @dx.op.tertiary.[[DTY]](i32 47, <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]], <[[NUM]] x double> [[dvec3]])
+  vector<double, NUM> dRes2 = fma(dVec1, dVec2, dVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<float16_t, NUM> >(0, hRes);
+  buf.Store<vector<float, NUM> >(2048, fRes);
+  buf.Store<vector<double, NUM> >(4096, dRes);
+  buf.Store<vector<double, NUM> >(5120, dRes2);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl
new file mode 100644
index 0000000000..50f98715e4
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl
@@ -0,0 +1,131 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad   -DOP=48 -DUOP=49 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad   -DOP=48 -DUOP=49 -DNUM=1022 %s | FileCheck %s
+
+#ifndef UOP
+#define UOP OP
+#endif
+
+// Test vector-enabled tertiary intrinsics that take signed and unsigned integer parameters of
+// different widths and are "trivial" in that they can be implemented with a single call
+// instruction with the same parameter and return types.
+
+RWByteAddressBuffer buf;
+
+// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16>
+// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32>
+// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64>
+
+[numthreads(8,1,1)]
+void main() {
+  // Loads three NUM-element vectors for each signed/unsigned 16/32/64-bit integer type, applies FUNC to each triple, and stores the results.
+  // Capture opcode numbers: OP and UOP are stored as constants so the checks below can bind them from the emitted stores.
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]]
+  buf.Store(888, OP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]]
+  buf.Store(999, UOP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512
+  // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[svec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<int16_t, NUM> sVec1 = buf.Load<vector<int16_t, NUM> >(0);
+  vector<int16_t, NUM> sVec2 = buf.Load<vector<int16_t, NUM> >(512);
+  vector<int16_t, NUM> sVec3 = buf.Load<vector<int16_t, NUM> >(1024);
+  // NOTE(review): the first load of each following group sits one past the previous group's last offset (1025, 2049, ...), presumably to keep the loads distinct - confirm.
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1025
+  // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536
+  // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[usvec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<uint16_t, NUM> usVec1 = buf.Load<vector<uint16_t, NUM> >(1025);
+  vector<uint16_t, NUM> usVec2 = buf.Load<vector<uint16_t, NUM> >(1536);
+  vector<uint16_t, NUM> usVec3 = buf.Load<vector<uint16_t, NUM> >(2048);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2049
+  // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560
+  // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072
+  // CHECK: [[ivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<int, NUM> iVec1 = buf.Load<vector<int, NUM> >(2049);
+  vector<int, NUM> iVec2 = buf.Load<vector<int, NUM> >(2560);
+  vector<int, NUM> iVec3 = buf.Load<vector<int, NUM> >(3072);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3073
+  // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584
+  // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[uivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<uint, NUM> uiVec1 = buf.Load<vector<uint, NUM> >(3073);
+  vector<uint, NUM> uiVec2 = buf.Load<vector<uint, NUM> >(3584);
+  vector<uint, NUM> uiVec3 = buf.Load<vector<uint, NUM> >(4096);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4097
+  // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
+  // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
+  // CHECK: [[lvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<int64_t, NUM> lVec1 = buf.Load<vector<int64_t, NUM> >(4097);
+  vector<int64_t, NUM> lVec2 = buf.Load<vector<int64_t, NUM> >(4608);
+  vector<int64_t, NUM> lVec3 = buf.Load<vector<int64_t, NUM> >(5120);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5121
+  // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632
+  // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 6144
+  // CHECK: [[ulvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<uint64_t, NUM> ulVec1 = buf.Load<vector<uint64_t, NUM> >(5121);
+  vector<uint64_t, NUM> ulVec2 = buf.Load<vector<uint64_t, NUM> >(5632);
+  vector<uint64_t, NUM> ulVec3 = buf.Load<vector<uint64_t, NUM> >(6144);
+
+  // Test simple matching type overloads: each FUNC call must remain one vector dx.op.tertiary call (OP for signed, UOP for unsigned) with no per-element extract/insert.
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i16> @dx.op.tertiary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]], <[[NUM]] x i16> [[svec3]])
+  vector<int16_t, NUM> sRes = FUNC(sVec1, sVec2, sVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i16> @dx.op.tertiary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]], <[[NUM]] x i16> [[usvec3]])
+  vector<uint16_t, NUM> usRes = FUNC(usVec1, usVec2, usVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i32> @dx.op.tertiary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]], <[[NUM]] x i32> [[ivec3]])
+  vector<int, NUM> iRes = FUNC(iVec1, iVec2, iVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i32> @dx.op.tertiary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]], <[[NUM]] x i32> [[uivec3]])
+  vector<uint, NUM> uiRes = FUNC(uiVec1, uiVec2, uiVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i64> @dx.op.tertiary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]], <[[NUM]] x i64> [[lvec3]])
+  vector<int64_t, NUM> lRes = FUNC(lVec1, lVec2, lVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i64> @dx.op.tertiary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]], <[[NUM]] x i64> [[ulvec3]])
+  vector<uint64_t, NUM> ulRes = FUNC(ulVec1, ulVec2, ulVec3);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<int16_t, NUM> >(0, sRes);
+  buf.Store<vector<uint16_t, NUM> >(1024, usRes);
+  buf.Store<vector<int, NUM> >(2048, iRes);
+  buf.Store<vector<uint, NUM> >(3072, uiRes);
+  buf.Store<vector<int64_t, NUM> >(4096, lRes);
+  buf.Store<vector<uint64_t, NUM> >(5120, ulRes);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl
new file mode 100644
index 0000000000..91ab631a7e
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl
@@ -0,0 +1,83 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate  -DOP=7 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate  -DOP=7 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos  -DOP=12 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos  -DOP=12 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sin  -DOP=13 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sin  -DOP=13 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tan  -DOP=14 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tan  -DOP=14 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=acos -DOP=15 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=acos -DOP=15 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=asin -DOP=16 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=asin -DOP=16 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=atan -DOP=17 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=atan -DOP=17 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cosh -DOP=18 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cosh -DOP=18 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sinh -DOP=19 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sinh -DOP=19 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tanh -DOP=20 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tanh -DOP=20 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=exp2 -DOP=21 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=exp2 -DOP=21 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=frac -DOP=22 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=frac -DOP=22 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log2 -DOP=23 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log2 -DOP=23 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log10 -DOP=23 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log10 -DOP=23 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sqrt -DOP=24 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sqrt -DOP=24 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=rsqrt -DOP=25 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=rsqrt -DOP=25 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=round -DOP=26 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=round -DOP=26 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=floor -DOP=27 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=floor -DOP=27 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ceil -DOP=28 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ceil -DOP=28 -DNUM=1022 %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=1022 %s | FileCheck %s
+
+// Test vector-enabled unary intrinsics that take float-like parameters and
+// are "trivial" in that they can be implemented with a single call
+// instruction with the same parameter and return types.
+
+RWByteAddressBuffer buf;
+
+// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half>
+// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float>
+
+[numthreads(8,1,1)]
+void main() {
+  // Loads one NUM-element half vector and one float vector, applies FUNC to each, and stores the results.
+  // Capture opcode number: OP is stored as a constant so the checks below can bind it from the emitted store.
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]]
+  buf.Store(999, OP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[hvec:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
+  vector<float16_t, NUM> hVec = buf.Load<vector<float16_t, NUM> >(0);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[fvec:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
+  vector<float, NUM> fVec = buf.Load<vector<float, NUM> >(1024);
+  // Each FUNC call below must remain one vector dx.op.unary call with no per-element extract/insert.
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec]])
+  vector<float16_t, NUM> hRes = FUNC(hVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec]])
+  vector<float, NUM> fRes = FUNC(fVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<float16_t, NUM> >(0, hRes);
+  buf.Store<vector<float, NUM> >(1024, fRes);
+}
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl
new file mode 100644
index 0000000000..ef0b250745
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl
@@ -0,0 +1,86 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=reversebits   -DOP=30 -DNUM=7    %s | FileCheck %s
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=reversebits   -DOP=30 -DNUM=1022 %s | FileCheck %s
+
+// Test vector-enabled unary intrinsics that take signed and unsigned integer parameters of
+// different widths and are "trivial" in that they can be implemented with a single call
+// instruction with the same parameter and return types.
+
+RWByteAddressBuffer buf;
+
+// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16>
+// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32>
+// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64>
+
+[numthreads(8,1,1)]
+void main() {
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // Capture opcode number.
+  // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]]
+  buf.Store(999, OP);
+
+  // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0
+  // CHECK: [[svec:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<int16_t, NUM> sVec = buf.Load<vector<int16_t, NUM> >(0);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024
+  // CHECK: [[usvec:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
+  vector<uint16_t, NUM> usVec = buf.Load<vector<uint16_t, NUM> >(1024);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048
+  // CHECK: [[ivec:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<int, NUM> iVec = buf.Load<vector<int, NUM> >(2048);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072
+  // CHECK: [[uivec:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
+  vector<uint, NUM> uiVec = buf.Load<vector<uint, NUM> >(3072);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
+  // CHECK: [[lvec:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<int64_t, NUM> lVec = buf.Load<vector<int64_t, NUM> >(4096);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
+  // CHECK: [[ulvec:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
+  vector<uint64_t, NUM> ulVec = buf.Load<vector<uint64_t, NUM> >(5120);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i16> @dx.op.unary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec]])
+  vector<int16_t, NUM> sRes = FUNC(sVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i16> @dx.op.unary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[usvec]])
+  vector<uint16_t, NUM> usRes = FUNC(usVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i32> @dx.op.unary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec]])
+  vector<int, NUM> iRes = FUNC(iVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i32> @dx.op.unary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[uivec]])
+  vector<uint, NUM> uiRes = FUNC(uiVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i64> @dx.op.unary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec]])
+  vector<int64_t, NUM> lRes = FUNC(lVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  // CHECK: call <[[NUM]] x i64> @dx.op.unary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[ulvec]])
+  vector<uint64_t, NUM> ulRes = FUNC(ulVec);
+
+  // CHECK-NOT: extractelement
+  // CHECK-NOT: insertelement
+  buf.Store<vector<int16_t, NUM> >(0, sRes);
+  buf.Store<vector<uint16_t, NUM> >(1024, usRes);
+  buf.Store<vector<int, NUM> >(2048, iRes);
+  buf.Store<vector<uint, NUM> >(3072, uiRes);
+  buf.Store<vector<int64_t, NUM> >(4096, lRes);
+  buf.Store<vector<uint64_t, NUM> >(5120, ulRes);
+}
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl
new file mode 100644
index 0000000000..11d705305d
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl
@@ -0,0 +1,186 @@
+// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=13   %s | FileCheck %s
+
+// Source for dxilgen test CodeGenDXIL/passes/longvec-intrinsics.ll.
+// Some targeted FileCheck testing is done incidentally.
+
+RWStructuredBuffer<vector<float16_t, NUM> > hBuf;
+RWStructuredBuffer<vector<float, NUM> > fBuf;
+RWStructuredBuffer<vector<double, NUM> > dBuf;
+
+RWStructuredBuffer<vector<bool, NUM> > bBuf;
+RWStructuredBuffer<vector<uint, NUM> > uBuf;
+RWStructuredBuffer<vector<int64_t, NUM> > lBuf;
+
+[numthreads(8,1,1)]
+void main() {
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 11, i32 0, i32 4) 
+  // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 12, i32 0, i32 4) 
+  // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 13, i32 0, i32 4) 
+  // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0
+  vector<float, NUM> fVec1 = fBuf[11];
+  vector<float, NUM> fVec2 = fBuf[12];
+  vector<float, NUM> fVec3 = fBuf[13];
+  
+  // CHECK: [[tmp:%.*]] = call <13 x float> @dx.op.binary.v13f32(i32 35, <13 x float> [[fvec1]], <13 x float> [[fvec2]])  ; FMax(a,b)
+  // CHECK: call <13 x float> @dx.op.binary.v13f32(i32 36, <13 x float> [[tmp]], <13 x float> [[fvec3]])  ; FMin(a,b)
+  vector<float, NUM> fRes = clamp(fVec1, fVec2, fVec3);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 14, i32 0, i32 2) 
+  // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 15, i32 0, i32 2) 
+  // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 16, i32 0, i32 2) 
+  // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0
+  vector<float16_t, NUM> hVec1 = hBuf[14];
+  vector<float16_t, NUM> hVec2 = hBuf[15];
+  vector<float16_t, NUM> hVec3 = hBuf[16];
+
+  // CHECK: [[tmp:%.*]] = fcmp fast olt <13 x half> [[hvec2]], [[hvec1]]
+  // CHECK: select <13 x i1> [[tmp]], <13 x half> zeroinitializer, <13 x half> <half 0xH3C00
+  vector<float16_t, NUM> hRes = step(hVec1, hVec2);
+
+  // CHECK: [[tmp:%.*]] = fmul fast <13 x float> [[fvec1]], <float 0x
+  // CHECK: call <13 x float> @dx.op.unary.v13f32(i32 21, <13 x float> [[tmp]])  ; Exp(value)
+  fRes += exp(fVec1);
+
+  // CHECK: [[tmp:%.*]] = call <13 x half> @dx.op.unary.v13f16(i32 23, <13 x half> [[hvec1]])  ; Log(value)
+  // CHECK: fmul fast <13 x half> [[tmp]], <half 0xH398C
+  hRes += log(hVec1);
+
+  // CHECK: [[sub:%.*]] = fsub fast <13 x float> [[fvec2]], [[fvec1]]
+  // CHECK: [[xsub:%.*]] = fsub fast <13 x float> [[fvec3]], [[fvec1]]
+  // CHECK: [[div:%.*]] = fdiv fast <13 x float> [[xsub]], [[sub]]
+  // CHECK: [[sat:%.*]] = call <13 x float> @dx.op.unary.v13f32(i32 7, <13 x float> [[div]])  ; Saturate(value)
+  // CHECK: [[mul:%.*]] = fmul fast <13 x float> [[sat]], <float 2.000000e+00,
+  // CHECK: [[sub:%.*]] = fsub fast <13 x float> <float 3.000000e+00, {{.*}}>, [[mul]]
+  // CHECK: [[mul:%.*]] = fmul fast <13 x float> [[sat]], [[sat]]
+  // CHECK: fmul fast <13 x float> [[mul]], [[sub]]
+  fRes += smoothstep(fVec1, fVec2, fVec3);
+
+  // Intrinsics that expand into llvm ops.
+
+  // CHECK: fmul fast <13 x float> [[fvec3]], <float 0x3F91DF46A0000000
+  fRes += radians(fVec3);
+
+  // CHECK: [[cmp:%.*]] = fcmp fast une <13 x float> [[fvec1]], zeroinitializer
+  // CHECK: [[f2i:%.*]] = bitcast <13 x float> [[fvec1]] to <13 x i32>
+  // CHECK: [[and:%.*]] = and <13 x i32> [[f2i]], <i32 2139095040
+  // CHECK: [[add:%.*]] = add nsw <13 x i32> [[and]], <i32 -1056964608
+  // CHECK: [[shr:%.*]] = ashr <13 x i32> [[add]], <i32 23
+  // CHECK: [[i2f:%.*]] = sitofp <13 x i32> [[shr]] to <13 x float>
+  // CHECK: [[sel:%.*]] = select <13 x i1> [[cmp]], <13 x float> [[i2f]], <13 x float> zeroinitializer
+  // CHECK: [[and:%.*]] = and <13 x i32> [[f2i]], <i32 8388607
+  // CHECK: or <13 x i32> [[and]], <i32 1056964608
+  vector<float, NUM> exp = fVec3;
+  fRes += frexp(fVec1, exp);
+  fRes += exp;
+
+  // CHECK: [[tmp:%.*]] = fsub fast <13 x half> [[hvec3]], [[hvec2]]
+  // CHECK: fmul fast <13 x half> [[tmp]], [[hvec1]]
+  hRes += lerp(hVec2, hVec3, hVec1);
+
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4) 
+  // CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 18, i32 0, i32 4) 
+  // CHECK: [[uvec2:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0
+  vector<uint, NUM> uVec1 = uBuf[17];
+  vector<uint, NUM> uVec2 = uBuf[18];
+
+  vector<uint, NUM> signs = 1;
+  // CHECK: [[cmp:%.*]] = icmp ne <13 x i32> [[uvec2]], zeroinitializer
+  // CHECK: zext <13 x i1> [[cmp]] to <13 x i32>
+  signs *= sign(uVec2);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i64 @dx.op.rawBufferVectorLoad.v13i64(i32 303, %dx.types.Handle {{%.*}}, i32 19, i32 0, i32 8) 
+  // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.v13i64 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i64 @dx.op.rawBufferVectorLoad.v13i64(i32 303, %dx.types.Handle {{%.*}}, i32 20, i32 0, i32 8) 
+  // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.v13i64 [[ld]], 0
+  vector<int64_t, NUM> lVec1 = lBuf[19];
+  vector<int64_t, NUM> lVec2 = lBuf[20];
+
+  // CHECK: [[gt:%.*]] = icmp sgt <13 x i64> [[lvec2]], zeroinitializer
+  // CHECK: [[lt:%.*]] = icmp slt <13 x i64> [[lvec2]], zeroinitializer
+  // CHECK: [[igt:%.*]] = zext <13 x i1> [[gt]] to <13 x i32>
+  // CHECK: [[ilt:%.*]] = zext <13 x i1> [[lt]] to <13 x i32>
+  // CHECK: sub nsw <13 x i32> [[igt]], [[ilt]]
+  signs *= sign(lVec2);
+
+  vector<uint, NUM> uRes = signs;
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 21, i32 0, i32 4) 
+  // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0
+  // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer
+  // CHECK: [[vec1:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32>
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 22, i32 0, i32 4) 
+  // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0
+  // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer
+  // CHECK: [[vec2:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32>
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 23, i32 0, i32 4) 
+  // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0
+  // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer
+  // CHECK: [[vec3:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32>
+  vector<bool, NUM> bVec1 = bBuf[21];
+  vector<bool, NUM> bVec2 = bBuf[22];
+  vector<bool, NUM> bVec3 = bBuf[23];
+
+  // CHECK: [[bvec2:%.*]] = icmp ne <13 x i32> [[vec2]], zeroinitializer
+  // CHECK: [[bvec1:%.*]] = icmp ne <13 x i32> [[vec1]], zeroinitializer
+  // CHECK: or <13 x i1> [[bvec2]], [[bvec1]]
+  uRes += or(bVec1, bVec2);
+
+  // CHECK: [[bvec3:%.*]] = icmp ne <13 x i32> [[vec3]], zeroinitializer
+  // CHECK: and <13 x i1> [[bvec3]], [[bvec2]]
+  uRes += and(bVec2, bVec3);
+
+  // CHECK: select <13 x i1> [[bvec3]], <13 x i64> [[lvec1]], <13 x i64> [[lvec2]]
+  vector<int64_t, NUM> lRes = select(bVec3, lVec1, lVec2);
+
+  // CHECK: [[el1:%.*]] = extractelement <13 x float> [[fvec1]]
+  // CHECK: [[el2:%.*]] = extractelement <13 x float> [[fvec2]]
+  // CHECK: [[mul:%.*]] = fmul fast float [[el2]], [[el1]]
+  // CHECK: [[mad1:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mul]]) ; FMad(a,b,c)
+  // CHECK: [[mad2:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad1]]) ; FMad(a,b,c)
+  // CHECK: [[mad3:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad2]]) ; FMad(a,b,c)
+  // CHECK: [[mad4:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad3]]) ; FMad(a,b,c)
+  // CHECK: [[mad5:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad4]]) ; FMad(a,b,c)
+  // CHECK: [[mad6:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad5]]) ; FMad(a,b,c)
+  // CHECK: [[mad7:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad6]]) ; FMad(a,b,c)
+  // CHECK: [[mad8:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad7]]) ; FMad(a,b,c)
+  // CHECK: [[mad9:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad8]]) ; FMad(a,b,c)
+  // CHECK: [[mad10:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad9]]) ; FMad(a,b,c)
+  // CHECK: [[mad11:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad10]]) ; FMad(a,b,c)
+  // CHECK: [[mad12:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad11]]) ; FMad(a,b,c)
+  fRes += dot(fVec1, fVec2);
+
+  // CHECK: call <13 x float> @dx.op.unary.v13f32(i32 17, <13 x float> [[fvec1]])  ; Atan(value)
+  fRes += atan(fVec1);
+
+  // CHECK: call <13 x i32> @dx.op.binary.v13i32(i32 40, <13 x i32> [[uvec1]], <13 x i32> [[uvec2]])  ; UMin(a,b)
+  uRes += min(uVec1, uVec2);
+
+  // CHECK: call <13 x float> @dx.op.tertiary.v13f32(i32 46, <13 x float> [[fvec1]], <13 x float> [[fvec2]], <13 x float> [[fvec3]])  ; FMad(a,b,c)
+  fRes += mad(fVec1, fVec2, fVec3);
+
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) 
+  // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) 
+  // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0
+  // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 26, i32 0, i32 8) 
+  // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0
+  vector<double, NUM> dVec1 = dBuf[24];
+  vector<double, NUM> dVec2 = dBuf[25];
+  vector<double, NUM> dVec3 = dBuf[26];
+
+  // CHECK: call <13 x double> @dx.op.tertiary.v13f64(i32 47, <13 x double> [[dvec1]], <13 x double> [[dvec2]], <13 x double> [[dvec3]])
+  vector<double, NUM> dRes = fma(dVec1, dVec2, dVec3);
+
+  hBuf[0] = hRes;
+  fBuf[0] = fRes;
+  dBuf[0] = dRes;
+  uBuf[0] = uRes;
+  lBuf[0] = lRes;
+}
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll
new file mode 100644
index 0000000000..8f9dcbbdbc
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll
@@ -0,0 +1,434 @@
+; RUN: %dxopt %s -dxilgen -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%"class.RWStructuredBuffer<vector<half, 7> >" = type { <7 x half> }
+%"class.RWStructuredBuffer<vector<float, 7> >" = type { <7 x float> }
+%"class.RWStructuredBuffer<vector<double, 7> >" = type { <7 x double> }
+%"class.RWStructuredBuffer<vector<bool, 7> >" = type { <7 x i32> }
+%"class.RWStructuredBuffer<vector<unsigned int, 7> >" = type { <7 x i32> }
+%"class.RWStructuredBuffer<vector<long long, 7> >" = type { <7 x i64> }
+%dx.types.Handle = type { i8* }
+%dx.types.ResourceProperties = type { i32, i32 }
+
+@"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" = external global %"class.RWStructuredBuffer<vector<half, 7> >", align 2
+@"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" = external global %"class.RWStructuredBuffer<vector<float, 7> >", align 4
+@"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" = external global %"class.RWStructuredBuffer<vector<double, 7> >", align 8
+@"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" = external global %"class.RWStructuredBuffer<vector<bool, 7> >", align 4
+@"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" = external global %"class.RWStructuredBuffer<vector<unsigned int, 7> >", align 4
+@"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" = external global %"class.RWStructuredBuffer<vector<long long, 7> >", align 8
+
+; CHECK-LABEL: define void @main()
+define void @main() #0 {
+bb:
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 11, i32 0, i32 4)
+  ; CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 12, i32 0, i32 4)
+  ; CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 13, i32 0, i32 4)
+  ; CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0
+
+  %exp = alloca <7 x float>, align 4
+  %tmp = load %"class.RWStructuredBuffer<vector<float, 7> >", %"class.RWStructuredBuffer<vector<float, 7> >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:23 col:30
+  %tmp1 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<float, 7> >" %tmp) ; line:23 col:30
+  %tmp2 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 14, %dx.types.Handle %tmp1, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<float, 7> >" zeroinitializer) ; line:23 col:30
+  %tmp3 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp2, i32 11) ; line:23 col:30
+  %tmp4 = load <7 x float>, <7 x float>* %tmp3 ; line:23 col:30
+  %tmp5 = load %"class.RWStructuredBuffer<vector<float, 7> >", %"class.RWStructuredBuffer<vector<float, 7> >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:24 col:30
+  %tmp6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<float, 7> >" %tmp5) ; line:24 col:30
+  %tmp7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 14, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<float, 7> >" zeroinitializer) ; line:24 col:30
+  %tmp8 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp7, i32 12) ; line:24 col:30
+  %tmp9 = load <7 x float>, <7 x float>* %tmp8 ; line:24 col:30
+  %tmp10 = load %"class.RWStructuredBuffer<vector<float, 7> >", %"class.RWStructuredBuffer<vector<float, 7> >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:25 col:30
+  %tmp11 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<float, 7> >" %tmp10) ; line:25 col:30
+  %tmp12 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 14, %dx.types.Handle %tmp11, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<float, 7> >" zeroinitializer) ; line:25 col:30
+  %tmp13 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp12, i32 13) ; line:25 col:30
+  %tmp14 = load <7 x float>, <7 x float>* %tmp13 ; line:25 col:30
+
+  ;  Clamp operation.
+  ; CHECK: [[max:%.*]] = call <7 x float> @dx.op.binary.v7f32(i32 35, <7 x float> [[fvec1]], <7 x float> [[fvec2]])
+  ; CHECK: call <7 x float> @dx.op.binary.v7f32(i32 36, <7 x float> [[max]], <7 x float> [[fvec3]])
+  %tmp15 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 119, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:29 col:29
+
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 14, i32 0, i32 2)
+  ; CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 15, i32 0, i32 2)
+  ; CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 16, i32 0, i32 2)
+  ; CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0
+  %tmp16 = load %"class.RWStructuredBuffer<vector<half, 7> >", %"class.RWStructuredBuffer<vector<half, 7> >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:37 col:34
+  %tmp17 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<half, 7> >" %tmp16) ; line:37 col:34
+  %tmp18 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 14, %dx.types.Handle %tmp17, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer<vector<half, 7> >" zeroinitializer) ; line:37 col:34
+  %tmp19 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp18, i32 14) ; line:37 col:34
+  %tmp20 = load <7 x half>, <7 x half>* %tmp19 ; line:37 col:34
+  %tmp21 = load %"class.RWStructuredBuffer<vector<half, 7> >", %"class.RWStructuredBuffer<vector<half, 7> >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:38 col:34
+  %tmp22 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<half, 7> >" %tmp21) ; line:38 col:34
+  %tmp23 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 14, %dx.types.Handle %tmp22, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer<vector<half, 7> >" zeroinitializer) ; line:38 col:34
+  %tmp24 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp23, i32 15) ; line:38 col:34
+  %tmp25 = load <7 x half>, <7 x half>* %tmp24 ; line:38 col:34
+  %tmp26 = load %"class.RWStructuredBuffer<vector<half, 7> >", %"class.RWStructuredBuffer<vector<half, 7> >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:39 col:34
+  %tmp27 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<half, 7> >" %tmp26) ; line:39 col:34
+  %tmp28 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 14, %dx.types.Handle %tmp27, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer<vector<half, 7> >" zeroinitializer) ; line:39 col:34
+  %tmp29 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp28, i32 16) ; line:39 col:34
+  %tmp30 = load <7 x half>, <7 x half>* %tmp29 ; line:39 col:34
+
+  ; Step operation.
+  ; CHECK: [[cmp:%.*]] = fcmp fast olt <7 x half> [[hvec2]], [[hvec1]]
+  ; CHECK: select <7 x i1> [[cmp]], <7 x half> zeroinitializer, <7 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>
+  %tmp31 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>)"(i32 192, <7 x half> %tmp20, <7 x half> %tmp25) ; line:43 col:33
+
+  ;  Exp operation.
+  ; CHECK: [[mul:%.*]] = fmul fast <7 x float> <float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000, float 0x3FF7154760000000>, [[fvec1]]
+  ; CHECK: call <7 x float> @dx.op.unary.v7f32(i32 21, <7 x float> [[mul]])
+  %tmp32 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 139, <7 x float> %tmp4) ; line:47 col:11
+  %tmp33 = fadd <7 x float> %tmp15, %tmp32 ; line:47 col:8
+
+  ;  Log operation.
+  ; CHECK: [[log:%.*]] = call <7 x half> @dx.op.unary.v7f16(i32 23, <7 x half> [[hvec1]])
+  ; CHECK: fmul fast <7 x half> <half 0xH398C, half 0xH398C, half 0xH398C, half 0xH398C, half 0xH398C, half 0xH398C, half 0xH398C>, [[log]]
+  %tmp34 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>)"(i32 159, <7 x half> %tmp20) ; line:51 col:11
+  %tmp35 = fadd <7 x half> %tmp31, %tmp34 ; line:51 col:8
+
+  ; Smoothstep operation.
+  ; CHECK: [[sub1:%.*]] = fsub fast <7 x float> [[fvec2]], [[fvec1]]
+  ; CHECK: [[sub2:%.*]] = fsub fast <7 x float> [[fvec3]], [[fvec1]]
+  ; CHECK: [[div:%.*]] = fdiv fast <7 x float> [[sub2]], [[sub1]]
+  ; CHECK: [[sat:%.*]] = call <7 x float> @dx.op.unary.v7f32(i32 7, <7 x float> [[div]])
+  ; CHECK: [[mul:%.*]] = fmul fast <7 x float> [[sat]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+  ; CHECK: [[sub:%.*]] = fsub fast <7 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, [[mul]]
+  ; CHECK: [[mul:%.*]] = fmul fast <7 x float> [[sat]], [[sub]]
+  ; CHECK: fmul fast <7 x float> [[sat]], [[mul]]
+  %tmp36 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 189, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:61 col:11
+  %tmp37 = fadd <7 x float> %tmp33, %tmp36 ; line:61 col:8
+
+  ;  Radians operation.
+  ; CHECK: fmul fast <7 x float> <float 0x3F91DF46A0000000, float 0x3F91DF46A0000000, float 0x3F91DF46A0000000, float 0x3F91DF46A0000000, float 0x3F91DF46A0000000, float 0x3F91DF46A0000000, float 0x3F91DF46A0000000>, [[fvec3]]
+  %tmp38 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 176, <7 x float> %tmp14) ; line:66 col:11
+  %tmp39 = fadd <7 x float> %tmp37, %tmp38 ; line:66 col:8
+  store <7 x float> %tmp14, <7 x float>* %exp, align 4 ; line:77 col:22
+
+  ;  Frexp operation.
+  ; CHECK: [[cmp:%.*]] = fcmp fast une <7 x float> [[fvec1]], zeroinitializer
+  ; CHECK: [[ext:%.*]] = sext <7 x i1> [[cmp]] to <7 x i32>
+  ; CHECK: [[bct:%.*]] = bitcast <7 x float> [[fvec1]] to <7 x i32>
+  ; CHECK: [[and:%.*]] = and <7 x i32> [[bct]], <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  ; CHECK: [[add:%.*]] = add <7 x i32> [[and]], <i32 -1056964608, i32 -1056964608, i32 -1056964608, i32 -1056964608, i32 -1056964608, i32 -1056964608, i32 -1056964608>
+  ; CHECK: [[and:%.*]] = and <7 x i32> [[add]], [[ext]]
+  ; CHECK: [[shr:%.*]] = ashr <7 x i32> [[and]], <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
+  ; CHECK: [[i2f:%.*]] = sitofp <7 x i32> [[shr]] to <7 x float>
+  ; CHECK: store <7 x float> [[i2f]], <7 x float>* %exp
+  ; CHECK: [[and:%.*]] = and <7 x i32> [[bct]], <i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  ; CHECK: [[or:%.*]] = or <7 x i32> [[and]], <i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608, i32 1056964608>
+  ; CHECK: [[and:%.*]] = and <7 x i32> [[or]], [[ext]]
+  ; CHECK: bitcast <7 x i32> [[and]] to <7 x float>
+  %tmp41 = call <7 x float> @"dx.hl.op..<7 x float> (i32, <7 x float>, <7 x float>*)"(i32 150, <7 x float> %tmp4, <7 x float>* %exp) ; line:78 col:11
+  %tmp42 = fadd <7 x float> %tmp39, %tmp41 ; line:78 col:8
+  %tmp43 = load <7 x float>, <7 x float>* %exp, align 4 ; line:79 col:11
+  %tmp44 = fadd <7 x float> %tmp42, %tmp43 ; line:79 col:8
+
+  ;  Lerp operation.
+  ; CHECK: [[sub:%.*]] = fsub fast <7 x half> [[hvec3]], [[hvec2]]
+  ; CHECK: fmul fast <7 x half> [[hvec1]], [[sub]]
+  %tmp45 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>, <7 x half>)"(i32 157, <7 x half> %tmp25, <7 x half> %tmp30, <7 x half> %tmp20) ; line:83 col:11
+  %tmp46 = fadd <7 x half> %tmp35, %tmp45 ; line:83 col:8
+
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4)
+  ; CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 18, i32 0, i32 4)
+  ; CHECK: [[uvec2:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0
+  %tmp47 = load %"class.RWStructuredBuffer<vector<unsigned int, 7> >", %"class.RWStructuredBuffer<vector<unsigned int, 7> >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:90 col:29
+  %tmp48 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" %tmp47) ; line:90 col:29
+  %tmp49 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32 14, %dx.types.Handle %tmp48, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" zeroinitializer) ; line:90 col:29
+  %tmp50 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp49, i32 17) ; line:90 col:29
+  %tmp51 = load <7 x i32>, <7 x i32>* %tmp50 ; line:90 col:29
+  %tmp52 = load %"class.RWStructuredBuffer<vector<unsigned int, 7> >", %"class.RWStructuredBuffer<vector<unsigned int, 7> >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:91 col:29
+  %tmp53 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" %tmp52) ; line:91 col:29
+  %tmp54 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32 14, %dx.types.Handle %tmp53, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" zeroinitializer) ; line:91 col:29
+  %tmp55 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp54, i32 18) ; line:91 col:29
+  %tmp56 = load <7 x i32>, <7 x i32>* %tmp55 ; line:91 col:29
+
+  ; Unsigned int sign operation.
+  ; CHECK: [[cmp:%.*]] = icmp ne <7 x i32> [[uvec2]], zeroinitializer
+  ; CHECK: zext <7 x i1> [[cmp]] to <7 x i32>
+  %tmp57 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>)"(i32 355, <7 x i32> %tmp56) ; line:96 col:12
+
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i64 @dx.op.rawBufferVectorLoad.v7i64(i32 303, %dx.types.Handle {{%.*}}, i32 19, i32 0, i32 8)
+  ; CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.v7i64 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i64 @dx.op.rawBufferVectorLoad.v7i64(i32 303, %dx.types.Handle {{%.*}}, i32 20, i32 0, i32 8)
+  ; CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.v7i64 [[ld]], 0
+  %tmp58 = load %"class.RWStructuredBuffer<vector<long long, 7> >", %"class.RWStructuredBuffer<vector<long long, 7> >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:102 col:32
+  %tmp59 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<long long, 7> >" %tmp58) ; line:102 col:32
+  %tmp60 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32 14, %dx.types.Handle %tmp59, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<long long, 7> >" zeroinitializer) ; line:102 col:32
+  %tmp61 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp60, i32 19) ; line:102 col:32
+  %tmp62 = load <7 x i64>, <7 x i64>* %tmp61 ; line:102 col:32
+  %tmp63 = load %"class.RWStructuredBuffer<vector<long long, 7> >", %"class.RWStructuredBuffer<vector<long long, 7> >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:103 col:32
+  %tmp64 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<long long, 7> >" %tmp63) ; line:103 col:32
+  %tmp65 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32 14, %dx.types.Handle %tmp64, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<long long, 7> >" zeroinitializer) ; line:103 col:32
+  %tmp66 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp65, i32 20) ; line:103 col:32
+  %tmp67 = load <7 x i64>, <7 x i64>* %tmp66 ; line:103 col:32
+
+  ; Signed int sign operation.
+  ; CHECK: [[lt1:%.*]] = icmp slt <7 x i64> zeroinitializer, [[lvec2]]
+  ; CHECK: [[lt2:%.*]] = icmp slt <7 x i64> [[lvec2]], zeroinitializer
+  ; CHECK: [[ilt1:%.*]] = zext <7 x i1> [[lt1]] to <7 x i32>
+  ; CHECK: [[ilt2:%.*]] = zext <7 x i1> [[lt2]] to <7 x i32>
+  ; CHECK: sub <7 x i32> [[ilt1]], [[ilt2]]
+  %tmp68 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i64>)"(i32 185, <7 x i64> %tmp67) ; line:110 col:12
+  %tmp69 = mul <7 x i32> %tmp57, %tmp68 ; line:110 col:9
+
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 21, i32 0, i32 4) 
+  ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0
+  ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer
+  ; CHECK: [[vec1:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32>
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 22, i32 0, i32 4) 
+  ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0
+  ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer
+  ; CHECK: [[vec2:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32>
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 23, i32 0, i32 4) 
+  ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0
+  ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer
+  ; CHECK: [[vec3:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32>
+  %tmp70 = load %"class.RWStructuredBuffer<vector<bool, 7> >", %"class.RWStructuredBuffer<vector<bool, 7> >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:126 col:29
+  %tmp71 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<bool, 7> >" %tmp70) ; line:126 col:29
+  %tmp72 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32 14, %dx.types.Handle %tmp71, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<bool, 7> >" zeroinitializer) ; line:126 col:29
+  %tmp73 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp72, i32 21) ; line:126 col:29
+  %tmp74 = load <7 x i32>, <7 x i32>* %tmp73 ; line:126 col:29
+  %tmp75 = icmp ne <7 x i32> %tmp74, zeroinitializer ; line:126 col:29
+  %tmp76 = zext <7 x i1> %tmp75 to <7 x i32> ; line:126 col:21
+  %tmp77 = load %"class.RWStructuredBuffer<vector<bool, 7> >", %"class.RWStructuredBuffer<vector<bool, 7> >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:127 col:29
+  %tmp78 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<bool, 7> >" %tmp77) ; line:127 col:29
+  %tmp79 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32 14, %dx.types.Handle %tmp78, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<bool, 7> >" zeroinitializer) ; line:127 col:29
+  %tmp80 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp79, i32 22) ; line:127 col:29
+  %tmp81 = load <7 x i32>, <7 x i32>* %tmp80 ; line:127 col:29
+  %tmp82 = icmp ne <7 x i32> %tmp81, zeroinitializer ; line:127 col:29
+  %tmp83 = zext <7 x i1> %tmp82 to <7 x i32> ; line:127 col:21
+  %tmp84 = load %"class.RWStructuredBuffer<vector<bool, 7> >", %"class.RWStructuredBuffer<vector<bool, 7> >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:128 col:29
+  %tmp85 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<bool, 7> >" %tmp84) ; line:128 col:29
+  %tmp86 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32 14, %dx.types.Handle %tmp85, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<bool, 7> >" zeroinitializer) ; line:128 col:29
+  %tmp87 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp86, i32 23) ; line:128 col:29
+  %tmp88 = load <7 x i32>, <7 x i32>* %tmp87 ; line:128 col:29
+  %tmp89 = icmp ne <7 x i32> %tmp88, zeroinitializer ; line:128 col:29
+  %tmp90 = zext <7 x i1> %tmp89 to <7 x i32> ; line:128 col:21
+
+
+  ; Or() operation.
+  ; CHECK: [[bvec2:%.*]] = icmp ne <7 x i32> [[vec2]], zeroinitializer
+  ; CHECK: [[bvec1:%.*]] = icmp ne <7 x i32> [[vec1]], zeroinitializer
+  ; CHECK: or <7 x i1> [[bvec1]], [[bvec2]]
+  %tmp91 = icmp ne <7 x i32> %tmp83, zeroinitializer ; line:133 col:21
+  %tmp92 = icmp ne <7 x i32> %tmp76, zeroinitializer ; line:133 col:14
+  %tmp93 = call <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32 169, <7 x i1> %tmp92, <7 x i1> %tmp91) ; line:133 col:11
+  %tmp94 = zext <7 x i1> %tmp93 to <7 x i32> ; line:133 col:11
+  %tmp95 = add <7 x i32> %tmp69, %tmp94 ; line:133 col:8
+
+  ; And() operation.
+  ; CHECK: [[bvec3:%.*]] = icmp ne <7 x i32> [[vec3]], zeroinitializer
+  ; CHECK: [[bvec2:%.*]] = icmp ne <7 x i32> [[vec2]], zeroinitializer
+  ; CHECK: and <7 x i1> [[bvec2]], [[bvec3]]
+  %tmp96 = icmp ne <7 x i32> %tmp90, zeroinitializer ; line:137 col:22
+  %tmp97 = icmp ne <7 x i32> %tmp83, zeroinitializer ; line:137 col:15
+  %tmp98 = call <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32 106, <7 x i1> %tmp97, <7 x i1> %tmp96) ; line:137 col:11
+  %tmp99 = zext <7 x i1> %tmp98 to <7 x i32> ; line:137 col:11
+  %tmp100 = add <7 x i32> %tmp95, %tmp99 ; line:137 col:8
+
+  ; Select() operation.
+  ; CHECK: [[bvec3:%.*]] = icmp ne <7 x i32> [[vec3]], zeroinitializer
+  ; CHECK: select <7 x i1> [[bvec3]], <7 x i64> [[lvec1]], <7 x i64> [[lvec2]]
+  %tmp101 = icmp ne <7 x i32> %tmp90, zeroinitializer ; line:140 col:38
+  %tmp102 = call <7 x i64> @"dx.hl.op.rn.<7 x i64> (i32, <7 x i1>, <7 x i64>, <7 x i64>)"(i32 184, <7 x i1> %tmp101, <7 x i64> %tmp62, <7 x i64> %tmp67) ; line:140 col:31
+  %tmp103 = call float @"dx.hl.op.rn.float (i32, <7 x float>, <7 x float>)"(i32 134, <7 x float> %tmp4, <7 x float> %tmp9) ; line:152 col:11
+
+  ; Dot operation.
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 0
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 0
+  ; CHECK: [[mul:%.*]] = fmul fast float [[el1]], [[el2]]
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 1
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 1
+  ; CHECK: [[mad1:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mul]])
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 2
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 2
+  ; CHECK: [[mad2:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad1]])
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 3
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 3
+  ; CHECK: [[mad3:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad2]])
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 4
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 4
+  ; CHECK: [[mad4:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad3]])
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 5
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 5
+  ; CHECK: [[mad5:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad4]])
+  ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 6
+  ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 6
+  ; CHECK: call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad5]])
+  %tmp104 = insertelement <7 x float> undef, float %tmp103, i32 0 ; line:152 col:11
+  %tmp105 = shufflevector <7 x float> %tmp104, <7 x float> undef, <7 x i32> zeroinitializer ; line:152 col:11
+  %tmp106 = fadd <7 x float> %tmp44, %tmp105 ; line:152 col:8
+
+  ; Atan operation.
+  ; CHECK: call <7 x float> @dx.op.unary.v7f32(i32 17, <7 x float> [[fvec1]])
+  %tmp107 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 116, <7 x float> %tmp4) ; line:155 col:11
+  %tmp108 = fadd <7 x float> %tmp106, %tmp107 ; line:155 col:8
+
+  ; Min operation.
+  ; CHECK: call <7 x i32> @dx.op.binary.v7i32(i32 40, <7 x i32> [[uvec1]], <7 x i32> [[uvec2]])
+  %tmp109 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>, <7 x i32>)"(i32 353, <7 x i32> %tmp51, <7 x i32> %tmp56) ; line:158 col:11
+  %tmp110 = add <7 x i32> %tmp100, %tmp109 ; line:158 col:8
+
+  ; Mad operation.
+  ; CHECK: call <7 x float> @dx.op.tertiary.v7f32(i32 46, <7 x float> [[fvec1]], <7 x float> [[fvec2]], <7 x float> [[fvec3]])
+  %tmp111 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 162, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:161 col:11
+  %tmp112 = fadd <7 x float> %tmp108, %tmp111 ; line:161 col:8
+
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) 
+  ; CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) 
+  ; CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0
+  ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 26, i32 0, i32 8) 
+  ; CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0
+  %tmp113 = load %"class.RWStructuredBuffer<vector<double, 7> >", %"class.RWStructuredBuffer<vector<double, 7> >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:169 col:31
+  %tmp114 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<double, 7> >" %tmp113) ; line:169 col:31
+  %tmp115 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 14, %dx.types.Handle %tmp114, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<double, 7> >" zeroinitializer) ; line:169 col:31
+  %tmp116 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp115, i32 24) ; line:169 col:31
+  %tmp117 = load <7 x double>, <7 x double>* %tmp116 ; line:169 col:31
+  %tmp118 = load %"class.RWStructuredBuffer<vector<double, 7> >", %"class.RWStructuredBuffer<vector<double, 7> >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:170 col:31
+  %tmp119 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<double, 7> >" %tmp118) ; line:170 col:31
+  %tmp120 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 14, %dx.types.Handle %tmp119, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<double, 7> >" zeroinitializer) ; line:170 col:31
+  %tmp121 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp120, i32 25) ; line:170 col:31
+  %tmp122 = load <7 x double>, <7 x double>* %tmp121 ; line:170 col:31
+  %tmp123 = load %"class.RWStructuredBuffer<vector<double, 7> >", %"class.RWStructuredBuffer<vector<double, 7> >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:171 col:31
+  %tmp124 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<double, 7> >" %tmp123) ; line:171 col:31
+  %tmp125 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 14, %dx.types.Handle %tmp124, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<double, 7> >" zeroinitializer) ; line:171 col:31
+  %tmp126 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp125, i32 26) ; line:171 col:31
+  %tmp127 = load <7 x double>, <7 x double>* %tmp126 ; line:171 col:31
+
+  ; FMA operation.
+  ; CHECK: call <7 x double> @dx.op.tertiary.v7f64(i32 47, <7 x double> [[dvec1]], <7 x double> [[dvec2]], <7 x double> [[dvec3]])
+  %tmp128 = call <7 x double> @"dx.hl.op.rn.<7 x double> (i32, <7 x double>, <7 x double>, <7 x double>)"(i32 147, <7 x double> %tmp117, <7 x double> %tmp122, <7 x double> %tmp127) ; line:174 col:30
+  %tmp129 = load %"class.RWStructuredBuffer<vector<half, 7> >", %"class.RWStructuredBuffer<vector<half, 7> >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:176 col:3
+  %tmp130 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<half, 7> >" %tmp129) ; line:176 col:3
+  %tmp131 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32 14, %dx.types.Handle %tmp130, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer<vector<half, 7> >" zeroinitializer) ; line:176 col:3
+  %tmp132 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp131, i32 0) ; line:176 col:3
+  store <7 x half> %tmp46, <7 x half>* %tmp132 ; line:176 col:11
+  %tmp133 = load %"class.RWStructuredBuffer<vector<float, 7> >", %"class.RWStructuredBuffer<vector<float, 7> >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:177 col:3
+  %tmp134 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<float, 7> >" %tmp133) ; line:177 col:3
+  %tmp135 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32 14, %dx.types.Handle %tmp134, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<float, 7> >" zeroinitializer) ; line:177 col:3
+  %tmp136 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp135, i32 0) ; line:177 col:3
+  store <7 x float> %tmp112, <7 x float>* %tmp136 ; line:177 col:11
+  %tmp137 = load %"class.RWStructuredBuffer<vector<double, 7> >", %"class.RWStructuredBuffer<vector<double, 7> >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:178 col:3
+  %tmp138 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<double, 7> >" %tmp137) ; line:178 col:3
+  %tmp139 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32 14, %dx.types.Handle %tmp138, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<double, 7> >" zeroinitializer) ; line:178 col:3
+  %tmp140 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp139, i32 0) ; line:178 col:3
+  store <7 x double> %tmp128, <7 x double>* %tmp140 ; line:178 col:11
+  %tmp141 = load %"class.RWStructuredBuffer<vector<unsigned int, 7> >", %"class.RWStructuredBuffer<vector<unsigned int, 7> >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:179 col:3
+  %tmp142 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" %tmp141) ; line:179 col:3
+  %tmp143 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32 14, %dx.types.Handle %tmp142, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" zeroinitializer) ; line:179 col:3
+  %tmp144 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp143, i32 0) ; line:179 col:3
+  store <7 x i32> %tmp110, <7 x i32>* %tmp144 ; line:179 col:11
+  %tmp145 = load %"class.RWStructuredBuffer<vector<long long, 7> >", %"class.RWStructuredBuffer<vector<long long, 7> >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:180 col:3
+  %tmp146 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32 0, %"class.RWStructuredBuffer<vector<long long, 7> >" %tmp145) ; line:180 col:3
+  %tmp147 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32 14, %dx.types.Handle %tmp146, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer<vector<long long, 7> >" zeroinitializer) ; line:180 col:3
+  %tmp148 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp147, i32 0) ; line:180 col:3
+  store <7 x i64> %tmp102, <7 x i64>* %tmp148 ; line:180 col:11
+  ret void ; line:181 col:1
+}
+
+declare <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32, %"class.RWStructuredBuffer<vector<float, 7> >") #1
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<float, 7> >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer<vector<float, 7> >") #1
+declare <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32, <7 x float>, <7 x float>, <7 x float>) #1
+declare <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32, %"class.RWStructuredBuffer<vector<half, 7> >") #1
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<half, 7> >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer<vector<half, 7> >") #1
+declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>)"(i32, <7 x half>, <7 x half>) #1
+declare <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32, <7 x float>) #1
+declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>)"(i32, <7 x half>) #1
+declare <7 x float> @"dx.hl.op..<7 x float> (i32, <7 x float>, <7 x float>*)"(i32, <7 x float>, <7 x float>*) #0
+declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>, <7 x half>)"(i32, <7 x half>, <7 x half>, <7 x half>) #1
+declare <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32, %"class.RWStructuredBuffer<vector<unsigned int, 7> >") #1
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<unsigned int, 7> >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer<vector<unsigned int, 7> >") #1
+declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>)"(i32, <7 x i32>) #1
+declare <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32, %"class.RWStructuredBuffer<vector<long long, 7> >") #1
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<long long, 7> >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer<vector<long long, 7> >") #1
+declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i64>)"(i32, <7 x i64>) #1
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32, %"class.RWStructuredBuffer<vector<bool, 7> >") #1
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<bool, 7> >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer<vector<bool, 7> >") #1
+declare <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32, <7 x i1>, <7 x i1>) #1
+declare <7 x i64> @"dx.hl.op.rn.<7 x i64> (i32, <7 x i1>, <7 x i64>, <7 x i64>)"(i32, <7 x i1>, <7 x i64>, <7 x i64>) #1
+declare float @"dx.hl.op.rn.float (i32, <7 x float>, <7 x float>)"(i32, <7 x float>, <7 x float>) #1
+declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>, <7 x i32>)"(i32, <7 x i32>, <7 x i32>) #1
+declare <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1
+declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32, %"class.RWStructuredBuffer<vector<double, 7> >") #1
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer<vector<double, 7> >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer<vector<double, 7> >") #1
+declare <7 x double> @"dx.hl.op.rn.<7 x double> (i32, <7 x double>, <7 x double>, <7 x double>)"(i32, <7 x double>, <7 x double>, <7 x double>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+
+!pauseresume = !{!1}
+!dx.version = !{!3}
+!dx.valver = !{!3}
+!dx.shaderModel = !{!4}
+!dx.typeAnnotations = !{!5, !36}
+!dx.entryPoints = !{!40}
+!dx.fnprops = !{!52}
+!dx.options = !{!53, !54}
+
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!3 = !{i32 1, i32 9}
+!4 = !{!"cs", i32 6, i32 9}
+!5 = !{i32 0, %"class.RWStructuredBuffer<vector<half, 7> >" undef, !6, %"class.RWStructuredBuffer<vector<float, 7> >" undef, !11, %"class.RWStructuredBuffer<vector<double, 7> >" undef, !16, %"class.RWStructuredBuffer<vector<bool, 7> >" undef, !21, %"class.RWStructuredBuffer<vector<unsigned int, 7> >" undef, !26, %"class.RWStructuredBuffer<vector<long long, 7> >" undef, !31}
+!6 = !{i32 14, !7, !8}
+!7 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 8, i32 13, i32 7}
+!8 = !{i32 0, !9}
+!9 = !{!10}
+!10 = !{i32 0, <7 x half> undef}
+!11 = !{i32 28, !12, !13}
+!12 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9, i32 13, i32 7}
+!13 = !{i32 0, !14}
+!14 = !{!15}
+!15 = !{i32 0, <7 x float> undef}
+!16 = !{i32 56, !17, !18}
+!17 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 10, i32 13, i32 7}
+!18 = !{i32 0, !19}
+!19 = !{!20}
+!20 = !{i32 0, <7 x double> undef}
+!21 = !{i32 28, !22, !23}
+!22 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 1, i32 13, i32 7}
+!23 = !{i32 0, !24}
+!24 = !{!25}
+!25 = !{i32 0, <7 x i1> undef}
+!26 = !{i32 28, !27, !28}
+!27 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5, i32 13, i32 7}
+!28 = !{i32 0, !29}
+!29 = !{!30}
+!30 = !{i32 0, <7 x i32> undef}
+!31 = !{i32 56, !32, !33}
+!32 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 6, i32 13, i32 7}
+!33 = !{i32 0, !34}
+!34 = !{!35}
+!35 = !{i32 0, <7 x i64> undef}
+!36 = !{i32 1, void ()* @main, !37}
+!37 = !{!38}
+!38 = !{i32 1, !39, !39}
+!39 = !{}
+!40 = !{void ()* @main, !"main", null, !41, null}
+!41 = !{null, !42, null, null}
+!42 = !{!43, !45, !47, !49, !50, !51}
+!43 = !{i32 0, %"class.RWStructuredBuffer<vector<half, 7> >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A", !"hBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !44}
+!44 = !{i32 1, i32 14}
+!45 = !{i32 1, %"class.RWStructuredBuffer<vector<float, 7> >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A", !"fBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46}
+!46 = !{i32 1, i32 28}
+!47 = !{i32 2, %"class.RWStructuredBuffer<vector<double, 7> >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A", !"dBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !48}
+!48 = !{i32 1, i32 56}
+!49 = !{i32 3, %"class.RWStructuredBuffer<vector<bool, 7> >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A", !"bBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46}
+!50 = !{i32 4, %"class.RWStructuredBuffer<vector<unsigned int, 7> >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A", !"uBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46}
+!51 = !{i32 5, %"class.RWStructuredBuffer<vector<long long, 7> >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A", !"lBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !48}
+!52 = !{void ()* @main, i32 5, i32 8, i32 1, i32 1}
+!53 = !{i32 0}
+!54 = !{i32 -1}
+!59 = !{!60, !60, i64 0}
+!60 = !{!"omnipotent char", !61, i64 0}
+!61 = !{!"Simple C/C++ TBAA"}
diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py
index 0008b752b1..a6cc52df1a 100644
--- a/utils/hct/hctdb.py
+++ b/utils/hct/hctdb.py
@@ -1503,7 +1503,7 @@ def UFI(name, **mappings):
                 next_op_idx,
                 "Unary",
                 "returns the " + i,
-                "hfd",
+                "hfd<",
                 "rn",
                 [
                     db_dxil_param(0, "$o", "", "operation result"),
@@ -1537,7 +1537,7 @@ def UFI(name, **mappings):
                 next_op_idx,
                 "Unary",
                 "returns the " + i,
-                "hf",
+                "hf<",
                 "rn",
                 [
                     db_dxil_param(0, "$o", "", "operation result"),
@@ -1554,7 +1554,7 @@ def UFI(name, **mappings):
                 next_op_idx,
                 "Unary",
                 "returns the reverse bit pattern of the input value",
-                "wil",
+                "wil<",
                 "rn",
                 [
                     db_dxil_param(0, "$o", "", "operation result"),
@@ -1601,7 +1601,7 @@ def UFI(name, **mappings):
                 next_op_idx,
                 "Binary",
                 "returns the " + i + " of the input values",
-                "hfd",
+                "hfd<",
                 "rn",
                 [
                     db_dxil_param(0, "$o", "", "operation result"),
@@ -1619,7 +1619,7 @@ def UFI(name, **mappings):
                 next_op_idx,
                 "Binary",
                 "returns the " + i + " of the input values",
-                "wil",
+                "wil<",
                 "rn",
                 [
                     db_dxil_param(0, "$o", "", "operation result"),
@@ -1674,7 +1674,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Tertiary",
             "performs a fused multiply add (FMA) of the form a * b + c",
-            "hfd",
+            "hfd<",
             "rn",
             [
                 db_dxil_param(
@@ -1691,7 +1691,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Tertiary",
             "performs a fused multiply add (FMA) of the form a * b + c",
-            "d",
+            "d<",
             "rn",
             [
                 db_dxil_param(
@@ -1715,7 +1715,7 @@ def UFI(name, **mappings):
                 next_op_idx,
                 "Tertiary",
                 "performs an integral " + i,
-                "wil",
+                "wil<",
                 "rn",
                 [
                     db_dxil_param(0, "$o", "", "the operation result"),
@@ -2608,7 +2608,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per stamp",
-            "hf",
+            "hf<",
             "rn",
             [
                 db_dxil_param(
@@ -2626,7 +2626,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per stamp",
-            "hf",
+            "hf<",
             "rn",
             [
                 db_dxil_param(
@@ -2644,7 +2644,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per pixel",
-            "hf",
+            "hf<",
             "rn",
             [
                 db_dxil_param(
@@ -2662,7 +2662,7 @@ def UFI(name, **mappings):
             next_op_idx,
             "Unary",
             "computes the rate of change of components per pixel",
-            "hf",
+            "hf<",
             "rn",
             [
                 db_dxil_param(