From 2bdf33e357e699d4fa088128760ebae0cebd9b7a Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Fri, 3 Jan 2025 14:25:22 -0700 Subject: [PATCH 1/6] Enable trivial native vector Dxil Operations plus a few This enables the generation of native vector DXIL Operations that are "trivial", meaning they take only a single DXOp Call instruction to implement as well as a few others that either only took such a call and some llvm operations or were of particular interest for other reasons. This involves allowing the overloads by adding the vector indication in hctdb, altering the lowering to maintain the vectors instead of scalarizing them, and a few sundry changes to fix issues along the way. The "trivial" dxil operations that return a different value from the overload type had to be moved out of the way and given their own lowering function so that the main function could generate vectors conditional on the version and vector type. These will be added in a later change. While the long vector supporting intrinsics that weren't given this treatment will continue to generate scalarized operations, some of them needed some work as well. The dot product for float vectors longer than 4 had to take the integer fallback path, which required some small modifications and a rename. Additionally, a heuristic for pow that malfunctioned with too many elements had to have a limit placed on it. Since the or()/and()/select() intrinsics translate directly to LLVM ops, they can have their lowering scalarization removed and what future scalarization might be needed by the current version can be done by later passes as with other LLVM operators. An issue with a special value used to represent unassigned dimensions had to be addressed since new dimensions can exceed that value. It's now MAX_INT. 
Contributes to #7120, but I'd prefer to leave it open until all intrinsics are covered --- lib/HLSL/HLOperationLower.cpp | 241 ++++++----- tools/clang/lib/Sema/SemaHLSL.cpp | 4 +- .../hlsl/types/longvec-intrinsics.hlsl | 391 ++++++++++++++++++ .../types/longvec-scalarized-intrinsics.hlsl | 146 +++++++ ...ngvec-trivial-binary-float-intrinsics.hlsl | 69 ++++ ...longvec-trivial-binary-int-intrinsics.hlsl | 116 ++++++ ...longvec-trivial-scalarized-intrinsics.hlsl | 87 ++++ ...vec-trivial-tertiary-float-intrinsics.hlsl | 86 ++++ ...ngvec-trivial-tertiary-int-intrinsics.hlsl | 131 ++++++ ...ongvec-trivial-unary-float-intrinsics.hlsl | 83 ++++ .../longvec-trivial-unary-int-intrinsics.hlsl | 86 ++++ utils/hct/hctdb.py | 24 +- 12 files changed, 1356 insertions(+), 108 deletions(-) create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index 4d8201df8d..8dda0230ba 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -424,6 +424,14 @@ struct IntrinsicLower { // IOP intrinsics. 
namespace { +// Creates the necessary scalar calls to for a "trivial" operation where only +// call instructions to a single function type are needed. +// The overload type `Ty` determines what scalarization might be required. +// Elements of any vectors in `refArgs` are extracted into scalars for each +// call generated while the same scalar values are used unaltered in each call. +// Utility objects `HlslOp` and `Builder` are used to generate calls to the +// given `DxilFunc` for each set of scalar arguments. +// The results are reconstructed into the given `RetTy` as needed. Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode, ArrayRef refArgs, Type *Ty, Type *RetTy, OP *hlslOP, IRBuilder<> &Builder) { @@ -459,12 +467,40 @@ Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode, } } } -// Generates a DXIL operation over an overloaded type (Ty), returning a -// RetTy value; when Ty is a vector, it will replicate per-element operations -// into RetTy to rebuild it. + +// Creates a native vector call to for a "trivial" operation where only a single +// call instruction is needed. The overload and return types are the same vector +// type `Ty`. +// Utility objects `HlslOp` and `Builder` are used to create a call to the given +// `DxilFunc` with `RefArgs` arguments. +Value *TrivialDxilVectorOperation(Function *Func, OP::OpCode Opcode, + ArrayRef Args, Type *Ty, + OP *OP, IRBuilder<> &Builder) { + if (!Ty->isVoidTy()) + return Builder.CreateCall(Func, Args, OP->GetOpCodeName(Opcode)); + else + return Builder.CreateCall(Func, Args); // Cannot add name to void. +} + +// Generates a DXIL operation with the overloaded type based on `Ty` and return +// type `RetTy`. 
When Ty is a vector, it will either generate per-element calls +// for each vector element and reconstruct the vector type from those results or +// operate on and return native vectors depending on vector size and the value +// of `SupportsVectors`, which is deteremined by version and opcode support. Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, Type *Ty, Type *RetTy, OP *hlslOP, - IRBuilder<> &Builder) { + IRBuilder<> &Builder, + bool SupportsVectors = false) { + + // If supported and the overload type is a vector with more than 1 element, + // create a native vector operation. + if (SupportsVectors && Ty->isVectorTy() && Ty->getVectorNumElements() > 1) { + Function *dxilFunc = hlslOP->GetOpFunc(opcode, Ty); + return TrivialDxilVectorOperation(dxilFunc, opcode, refArgs, Ty, hlslOP, + Builder); + } + + // Set overload type to the scalar type of `Ty` and generate call(s). Type *EltTy = Ty->getScalarType(); Function *dxilFunc = hlslOP->GetOpFunc(opcode, EltTy); @@ -484,43 +520,66 @@ Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, return TrivialDxilOperation(opcode, refArgs, Ty, Inst->getType(), hlslOP, B); } -Value *TrivialDxilUnaryOperationRet(OP::OpCode opcode, Value *src, Type *RetTy, - hlsl::OP *hlslOP, IRBuilder<> &Builder) { - Type *Ty = src->getType(); +// Translate call that converts to a dxil unary operation with a different +// return type from the overload by passing the argument, explicit return type, +// and helper objects to the scalarizing unary dxil operation creation. 
+Value *TrivialUnaryOperationRet(CallInst *CI, IntrinsicOp IOP, + OP::OpCode opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *pObjHelper, + bool &Translated) { + Value *Src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Type *Ty = Src->getType(); - Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, src}; + IRBuilder<> Builder(CI); + hlsl::OP *OP = &Helper.hlslOP; + Type *RetTy = CI->getType(); + Constant *opArg = OP->GetU32Const((unsigned)opcode); + Value *args[] = {opArg, Src}; - return TrivialDxilOperation(opcode, args, Ty, RetTy, hlslOP, Builder); + return TrivialDxilOperation(opcode, args, Ty, RetTy, OP, Builder); } Value *TrivialDxilUnaryOperation(OP::OpCode opcode, Value *src, - hlsl::OP *hlslOP, IRBuilder<> &Builder) { - return TrivialDxilUnaryOperationRet(opcode, src, src->getType(), hlslOP, - Builder); + hlsl::OP *hlslOP, IRBuilder<> &Builder, + bool SupportsVectors = false) { + Type *Ty = src->getType(); + + Constant *OpArg = hlslOP->GetU32Const((unsigned)opcode); + Value *Args[] = {OpArg, src}; + + return TrivialDxilOperation(opcode, Args, Ty, Ty, hlslOP, Builder, + SupportsVectors); } Value *TrivialDxilBinaryOperation(OP::OpCode opcode, Value *src0, Value *src1, - hlsl::OP *hlslOP, IRBuilder<> &Builder) { + hlsl::OP *hlslOP, IRBuilder<> &Builder, + bool SupportsVectors = false) { Type *Ty = src0->getType(); Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); Value *args[] = {opArg, src0, src1}; - return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder); + return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder, + SupportsVectors); } Value *TrivialDxilTrinaryOperation(OP::OpCode opcode, Value *src0, Value *src1, Value *src2, hlsl::OP *hlslOP, - IRBuilder<> &Builder) { + IRBuilder<> &Builder, + bool SupportsVectors = false) { Type *Ty = src0->getType(); Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); Value *args[] = {opArg, src0, src1, src2}; - return 
TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder); + return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder, + SupportsVectors); } +// Translate call that trivially converts to a dxil unary operation by passing +// argument, return type, and helper objects to either scalarizing or native +// vector dxil operation creation depending on version and vector size. Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, @@ -528,11 +587,14 @@ Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *src0 = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); IRBuilder<> Builder(CI); hlsl::OP *hlslOP = &helper.hlslOP; - Value *retVal = TrivialDxilUnaryOperationRet(opcode, src0, CI->getType(), - hlslOP, Builder); - return retVal; + + return TrivialDxilUnaryOperation(opcode, src0, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); } +// Translate call that trivially converts to a dxil binary operation by passing +// arguments, return type, and helper objects to either scalarizing or native +// vector dxil operation creation depending on version and vector size. 
Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, @@ -542,11 +604,14 @@ Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *src1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); IRBuilder<> Builder(CI); - Value *binOp = - TrivialDxilBinaryOperation(opcode, src0, src1, hlslOP, Builder); - return binOp; + return TrivialDxilBinaryOperation(opcode, src0, src1, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); } +// Translate call that trivially converts to a dxil trinary (aka tertiary) +// operation by passing arguments, return type, and helper objects to either +// scalarizing or native vector dxil operation creation depending on version +// and vector size. Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLOperationLowerHelper &helper, HLObjectOperationLowerHelper *pObjHelper, @@ -557,9 +622,8 @@ Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *src2 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); IRBuilder<> Builder(CI); - Value *triOp = - TrivialDxilTrinaryOperation(opcode, src0, src1, src2, hlslOP, Builder); - return triOp; + return TrivialDxilTrinaryOperation(opcode, src0, src1, src2, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); } Value *TrivialIsSpecialFloat(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -738,6 +802,12 @@ bool CanUseFxcMulOnlyPatternForPow(IRBuilder<> &Builder, Value *x, Value *pow, } } + // Only apply on aggregates of 16 or fewer elements, + // representing the max 4x4 matrix size. + Type *Ty = x->getType(); + if (Ty->isVectorTy() && Ty->getVectorNumElements() > 16) + return false; + APFloat powAPF = isa(pow) ? 
cast(pow)->getElementAsAPFloat(0) : // should be a splat value @@ -1447,6 +1517,7 @@ Value *TranslateWaveA2B(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *refArgs[] = {nullptr, CI->getOperand(1)}; return TrivialDxilOperation(opcode, refArgs, helper.voidTy, CI, hlslOP); } + // Wave ballot intrinsic. Value *TranslateWaveBallot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLOperationLowerHelper &helper, @@ -1899,9 +1970,11 @@ Value *TranslateClamp(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, IRBuilder<> Builder(CI); // min(max(x, minVal), maxVal). - Value *maxXMinVal = - TrivialDxilBinaryOperation(maxOp, x, minVal, hlslOP, Builder); - return TrivialDxilBinaryOperation(minOp, maxXMinVal, maxVal, hlslOP, Builder); + bool SupportsVectors = helper.M.GetShaderModel()->IsSM69Plus(); + Value *maxXMinVal = TrivialDxilBinaryOperation(maxOp, x, minVal, hlslOP, + Builder, SupportsVectors); + return TrivialDxilBinaryOperation(minOp, maxXMinVal, maxVal, hlslOP, Builder, + SupportsVectors); } Value *TranslateClip(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2019,7 +2092,7 @@ Value *TranslateFirstbitHi(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { Value *firstbitHi = - TrivialUnaryOperation(CI, IOP, opcode, helper, pObjHelper, Translated); + TrivialUnaryOperationRet(CI, IOP, opcode, helper, pObjHelper, Translated); // firstbitHi == -1? 
-1 : (bitWidth-1 -firstbitHi); IRBuilder<> Builder(CI); Constant *neg1 = Builder.getInt32(-1); @@ -2052,7 +2125,7 @@ Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { Value *firstbitLo = - TrivialUnaryOperation(CI, IOP, opcode, helper, pObjHelper, Translated); + TrivialUnaryOperationRet(CI, IOP, opcode, helper, pObjHelper, Translated); return firstbitLo; } @@ -2214,8 +2287,9 @@ Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, ConstantVector::getSplat(Ty->getVectorNumElements(), log2eConst); } val = Builder.CreateFMul(log2eConst, val); - Value *exp = TrivialDxilUnaryOperation(OP::OpCode::Exp, val, hlslOP, Builder); - return exp; + + return TrivialDxilUnaryOperation(OP::OpCode::Exp, val, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); } Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2230,7 +2304,10 @@ Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, if (Ty != Ty->getScalarType()) { ln2Const = ConstantVector::getSplat(Ty->getVectorNumElements(), ln2Const); } - Value *log = TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder); + + Value *log = + TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); return Builder.CreateFMul(ln2Const, log); } @@ -2248,7 +2325,9 @@ Value *TranslateLog10(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, log2_10Const = ConstantVector::getSplat(Ty->getVectorNumElements(), log2_10Const); } - Value *log = TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder); + Value *log = + TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); return Builder.CreateFMul(log2_10Const, log); } @@ -2431,17 +2510,18 @@ Value *TrivialDotOperation(OP::OpCode opcode, Value *src0, Value *src1, return dotOP; } -Value *TranslateIDot(Value *arg0, Value *arg1, unsigned 
vecSize, - hlsl::OP *hlslOP, IRBuilder<> &Builder, - bool Unsigned = false) { - auto madOpCode = Unsigned ? DXIL::OpCode::UMad : DXIL::OpCode::IMad; +// Instead of using a DXIL intrinsic, implement a dot product operation using +// multiply and add operations. Used for integer dots and long vectors. +Value *ExpandDot(Value *arg0, Value *arg1, unsigned vecSize, hlsl::OP *hlslOP, + IRBuilder<> &Builder, + DXIL::OpCode MadOpCode = DXIL::OpCode::IMad) { Value *Elt0 = Builder.CreateExtractElement(arg0, (uint64_t)0); Value *Elt1 = Builder.CreateExtractElement(arg1, (uint64_t)0); Value *Result = Builder.CreateMul(Elt0, Elt1); - for (unsigned iVecElt = 1; iVecElt < vecSize; ++iVecElt) { - Elt0 = Builder.CreateExtractElement(arg0, iVecElt); - Elt1 = Builder.CreateExtractElement(arg1, iVecElt); - Result = TrivialDxilTrinaryOperation(madOpCode, Elt0, Elt1, Result, hlslOP, + for (unsigned Elt = 1; Elt < vecSize; ++Elt) { + Elt0 = Builder.CreateExtractElement(arg0, Elt); + Elt1 = Builder.CreateExtractElement(arg1, Elt); + Result = TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, hlslOP, Builder); } @@ -2480,11 +2560,16 @@ Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, unsigned vecSize = Ty->getVectorNumElements(); Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); IRBuilder<> Builder(CI); - if (Ty->getScalarType()->isFloatingPointTy()) { + Type *EltTy = Ty->getScalarType(); + if (EltTy->isFloatingPointTy() && Ty->getVectorNumElements() <= 4) { return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder); } else { - return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder, - IOP == IntrinsicOp::IOP_udot); + DXIL::OpCode MadOpCode = DXIL::OpCode::IMad; + if (IOP == IntrinsicOp::IOP_udot) + MadOpCode = DXIL::OpCode::UMad; + else if (EltTy->isFloatingPointTy()) + MadOpCode = DXIL::OpCode::FMad; + return ExpandDot(arg0, arg1, vecSize, hlslOP, Builder, MadOpCode); } } @@ -2601,8 +2686,9 @@ Value *TranslateSmoothStep(CallInst *CI, 
IntrinsicOp IOP, OP::OpCode opcode, Value *xSubMin = Builder.CreateFSub(x, minVal); Value *satVal = Builder.CreateFDiv(xSubMin, maxSubMin); - Value *s = TrivialDxilUnaryOperation(DXIL::OpCode::Saturate, satVal, hlslOP, - Builder); + Value *s = + TrivialDxilUnaryOperation(DXIL::OpCode::Saturate, satVal, hlslOP, Builder, + helper.M.GetShaderModel()->IsSM69Plus()); // return s * s *(3-2*s). Constant *c2 = ConstantFP::get(CI->getType(), 2); Constant *c3 = ConstantFP::get(CI->getType(), 3); @@ -3032,8 +3118,10 @@ Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, if (arg0Ty->getScalarType()->isFloatingPointTy()) { return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder); } else { - return TranslateIDot(arg0, arg1, vecSize, hlslOP, Builder, - IOP == IntrinsicOp::IOP_umul); + DXIL::OpCode MadOpCode = DXIL::OpCode::IMad; + if (IOP == IntrinsicOp::IOP_umul) + MadOpCode = DXIL::OpCode::UMad; + return ExpandDot(arg0, arg1, vecSize, hlslOP, Builder, MadOpCode); } } else { // mul(vector, scalar) == vector * scalar-splat @@ -6150,20 +6238,8 @@ Value *TranslateAnd(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, bool &Translated) { Value *x = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); - Type *Ty = CI->getType(); - Type *EltTy = Ty->getScalarType(); IRBuilder<> Builder(CI); - if (Ty != EltTy) { - Value *Result = UndefValue::get(Ty); - for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) { - Value *EltX = Builder.CreateExtractElement(x, i); - Value *EltY = Builder.CreateExtractElement(y, i); - Value *tmp = Builder.CreateAnd(EltX, EltY); - Result = Builder.CreateInsertElement(Result, tmp, i); - } - return Result; - } return Builder.CreateAnd(x, y); } Value *TranslateOr(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -6171,20 +6247,8 @@ Value *TranslateOr(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, HLObjectOperationLowerHelper *pObjHelper, bool &Translated) { Value *x = 
CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); Value *y = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); - Type *Ty = CI->getType(); - Type *EltTy = Ty->getScalarType(); IRBuilder<> Builder(CI); - if (Ty != EltTy) { - Value *Result = UndefValue::get(Ty); - for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) { - Value *EltX = Builder.CreateExtractElement(x, i); - Value *EltY = Builder.CreateExtractElement(y, i); - Value *tmp = Builder.CreateOr(EltX, EltY); - Result = Builder.CreateInsertElement(Result, tmp, i); - } - return Result; - } return Builder.CreateOr(x, y); } Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -6194,21 +6258,8 @@ Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *cond = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx); Value *t = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx); Value *f = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); - Type *Ty = CI->getType(); - Type *EltTy = Ty->getScalarType(); IRBuilder<> Builder(CI); - if (Ty != EltTy) { - Value *Result = UndefValue::get(Ty); - for (unsigned i = 0; i < Ty->getVectorNumElements(); i++) { - Value *EltCond = Builder.CreateExtractElement(cond, i); - Value *EltTrue = Builder.CreateExtractElement(t, i); - Value *EltFalse = Builder.CreateExtractElement(f, i); - Value *tmp = Builder.CreateSelect(EltCond, EltTrue, EltFalse); - Result = Builder.CreateInsertElement(Result, tmp, i); - } - return Result; - } return Builder.CreateSelect(cond, t, f); } } // namespace @@ -6467,18 +6518,20 @@ IntrinsicLower gLowerTable[] = { {IntrinsicOp::IOP_clip, TranslateClip, DXIL::OpCode::NumOpCodes}, {IntrinsicOp::IOP_cos, TrivialUnaryOperation, DXIL::OpCode::Cos}, {IntrinsicOp::IOP_cosh, TrivialUnaryOperation, DXIL::OpCode::Hcos}, - {IntrinsicOp::IOP_countbits, TrivialUnaryOperation, + {IntrinsicOp::IOP_countbits, TrivialUnaryOperationRet, DXIL::OpCode::Countbits}, {IntrinsicOp::IOP_cross, TranslateCross, 
DXIL::OpCode::NumOpCodes}, - {IntrinsicOp::IOP_ddx, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseX}, - {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddx, TrivialUnaryOperationRet, + DXIL::OpCode::DerivCoarseX}, + {IntrinsicOp::IOP_ddx_coarse, TrivialUnaryOperationRet, DXIL::OpCode::DerivCoarseX}, - {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddx_fine, TrivialUnaryOperationRet, DXIL::OpCode::DerivFineX}, - {IntrinsicOp::IOP_ddy, TrivialUnaryOperation, DXIL::OpCode::DerivCoarseY}, - {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddy, TrivialUnaryOperationRet, + DXIL::OpCode::DerivCoarseY}, + {IntrinsicOp::IOP_ddy_coarse, TrivialUnaryOperationRet, DXIL::OpCode::DerivCoarseY}, - {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperation, + {IntrinsicOp::IOP_ddy_fine, TrivialUnaryOperationRet, DXIL::OpCode::DerivFineY}, {IntrinsicOp::IOP_degrees, TranslateDegrees, DXIL::OpCode::NumOpCodes}, {IntrinsicOp::IOP_determinant, EmptyLower, DXIL::OpCode::NumOpCodes}, diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp index 027d7d3cbc..3dac550218 100644 --- a/tools/clang/lib/Sema/SemaHLSL.cpp +++ b/tools/clang/lib/Sema/SemaHLSL.cpp @@ -6606,7 +6606,7 @@ bool HLSLExternalSource::MatchArguments( argTypes.clear(); const bool isVariadic = IsVariadicIntrinsicFunction(pIntrinsic); - static const UINT UnusedSize = 0xFF; + static const UINT UnusedSize = UINT_MAX; static const BYTE MaxIntrinsicArgs = g_MaxIntrinsicParamCount + 1; #define CAB(cond, arg) \ { \ @@ -6622,7 +6622,7 @@ bool HLSLExternalSource::MatchArguments( ArBasicKind ComponentType[MaxIntrinsicArgs]; // Component type for each argument, // AR_BASIC_UNKNOWN if unspecified. - UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UNUSED_INDEX32 + UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UnusedSize // if unspecified. 
badArgIdx = MaxIntrinsicArgs; diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl new file mode 100644 index 0000000000..af6f96745c --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl @@ -0,0 +1,391 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=125 %s | FileCheck %s + +// Test vector-enabled non-trivial intrinsics that take parameters of various types. + +RWByteAddressBuffer buf; +RWByteAddressBuffer ibuf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> +// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double> + +[numthreads(8,1,1)] +void main() { + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle {{%.*}}, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec1 = buf.Load >(0); + vector hVec2 = buf.Load >(512); + vector hVec3 = 
buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec1 = buf.Load >(2048); + vector fVec2 = buf.Load >(2560); + vector fVec3 = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + vector dVec1 = buf.Load >(4096); + vector dVec2 = buf.Load >(4608); + vector dVec3 = buf.Load >(5120); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle {{%.*}}, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // 
CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[svec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec1 = ibuf.Load >(0); + vector sVec2 = ibuf.Load >(512); + vector sVec3 = ibuf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1025 + // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536 + // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[usvec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec1 = ibuf.Load >(1025); + vector usVec2 = ibuf.Load >(1536); + vector usVec3 = ibuf.Load >(2048); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2049 + // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[ivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec1 = ibuf.Load >(2049); + vector iVec2 = ibuf.Load >(2560); + vector iVec3 = ibuf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], 
i32 3073 + // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584 + // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[uivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec1 = ibuf.Load >(3073); + vector uiVec2 = ibuf.Load >(3584); + vector uiVec3 = ibuf.Load >(4096); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4097 + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[lvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec1 = ibuf.Load >(4097); + vector lVec2 = ibuf.Load >(4608); + vector lVec3 = ibuf.Load >(5120); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5121 + // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632 + // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 6144 + // CHECK: [[ulvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + 
vector ulVec1 = ibuf.Load >(5121); + vector ulVec2 = ibuf.Load >(5632); + vector ulVec3 = ibuf.Load >(6144); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 35, <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]]) ; FMax(a,b) + // CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 36, <[[NUM]] x half> [[tmp]], <[[NUM]] x half> [[hvec3]]) ; FMin(a,b) + vector hRes = clamp(hVec1, hVec2, hVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 35, <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]]) ; FMax(a,b) + // CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 36, <[[NUM]] x float> [[tmp]], <[[NUM]] x float> [[fvec3]]) ; FMin(a,b) + vector fRes = clamp(fVec1, fVec2, fVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 35, <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]]) ; FMax(a,b) + // CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 36, <[[NUM]] x double> [[tmp]], <[[NUM]] x double> [[dvec3]]) ; FMin(a,b) + vector dRes = clamp(dVec1, dVec2, dVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 37, <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]]) ; IMax(a,b) + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 38, <[[NUM]] x i16> [[tmp]], <[[NUM]] x i16> [[svec3]]) ; IMin(a,b) + vector sRes = clamp(sVec1, sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 39, <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]]) ; UMax(a,b) + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 40, <[[NUM]] x i16> [[tmp]], <[[NUM]] x i16> [[usvec3]]) ; UMin(a,b) + vector 
usRes = clamp(usVec1, usVec2, usVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 37, <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]]) ; IMax(a,b) + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 38, <[[NUM]] x i32> [[tmp]], <[[NUM]] x i32> [[ivec3]]) ; IMin(a,b) + vector iRes = clamp(iVec1, iVec2, iVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 39, <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]]) ; UMax(a,b) + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 40, <[[NUM]] x i32> [[tmp]], <[[NUM]] x i32> [[uivec3]]) ; UMin(a,b) + vector uiRes = clamp(uiVec1, uiVec2, uiVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 37, <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]]) ; IMax(a,b) + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 38, <[[NUM]] x i64> [[tmp]], <[[NUM]] x i64> [[lvec3]]) ; IMin(a,b) + vector lRes = clamp(lVec1, lVec2, lVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 39, <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]]) ; UMax(a,b) + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 40, <[[NUM]] x i64> [[tmp]], <[[NUM]] x i64> [[ulvec3]]) ; UMin(a,b) + vector ulRes = clamp(ulVec1, ulVec2, ulVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fcmp fast olt <[[NUM]] x half> [[hvec2]], [[hvec1]] + // CHECK: select <[[NUM]] x i1> [[tmp]], <[[NUM]] x half> zeroinitializer, <[[NUM]] x half> [[fvec2]], [[fvec1]] + // CHECK: select <[[NUM]] x i1> [[tmp]], <[[NUM]] x float> zeroinitializer, <[[NUM]] x float> [[hvec1]], @dx.op.unary.[[HTY]](i32 21, <[[NUM]] x half> [[tmp]]) ; Exp(value) + hRes 
+= exp(hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fmul fast <[[NUM]] x float> [[fvec1]], @dx.op.unary.[[FTY]](i32 21, <[[NUM]] x float> [[tmp]]) ; Exp(value) + fRes += exp(fVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 23, <[[NUM]] x half> [[hvec1]]) ; Log(value) + // CHECK: fmul fast <[[NUM]] x half> [[tmp]], @dx.op.unary.[[FTY]](i32 23, <[[NUM]] x float> [[fvec1]]) ; Log(value) + // CHECK: fmul fast <[[NUM]] x float> [[tmp]], [[hvec2]], [[hvec1]] + // CHECK: [[xsub:%.*]] = fsub fast <[[NUM]] x half> [[hvec3]], [[hvec1]] + // CHECK: [[div:%.*]] = fdiv fast <[[NUM]] x half> [[xsub]], [[sub]] + // CHECK: [[sat:%.*]] = call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 7, <[[NUM]] x half> [[div]]) ; Saturate(value) + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x half> [[sat]], , [[mul]] + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x half> [[sat]], [[sat]] + // CHECK: fmul fast <[[NUM]] x half> [[mul]], [[sub]] + hRes += smoothstep(hVec1, hVec2, hVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[sub:%.*]] = fsub fast <[[NUM]] x float> [[fvec2]], [[fvec1]] + // CHECK: [[xsub:%.*]] = fsub fast <[[NUM]] x float> [[fvec3]], [[fvec1]] + // CHECK: [[div:%.*]] = fdiv fast <[[NUM]] x float> [[xsub]], [[sub]] + // CHECK: [[sat:%.*]] = call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 7, <[[NUM]] x float> [[div]]) ; Saturate(value) + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x float> [[sat]], , [[mul]] + // CHECK: [[mul:%.*]] = fmul fast <[[NUM]] x float> [[sat]], [[sat]] + // CHECK: fmul fast <[[NUM]] x float> [[mul]], [[sub]] + fRes += smoothstep(fVec1, fVec2, fVec3); + + // Intrinsics that expand into llvm ops. 
+ + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: fmul fast <[[NUM]] x half> [[hvec2]], [[fvec2]], [[hvec3]], [[fvec3]], [[fvec1]], zeroinitializer + // CHECK: [[f2i:%.*]] = bitcast <[[NUM]] x float> [[fvec1]] to <[[NUM]] x i32> + // CHECK: [[and:%.*]] = and <[[NUM]] x i32> [[f2i]], [[and]], [[add]], [[shr]] to <[[NUM]] x float> + // CHECK: [[sel:%.*]] = select <[[NUM]] x i1> [[cmp]], <[[NUM]] x float> [[i2f]], <[[NUM]] x float> zeroinitializer + // CHECK: [[and:%.*]] = and <[[NUM]] x i32> [[f2i]], [[and]], exp = fVec3; + fRes += frexp(fVec1, exp); + fRes += exp; + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fsub fast <[[NUM]] x half> [[hvec3]], [[hvec2]] + // CHECK: fmul fast <[[NUM]] x half> [[tmp]], [[hvec1]] + hRes += lerp(hVec2, hVec3, hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[tmp:%.*]] = fsub fast <[[NUM]] x float> [[fvec3]], [[fvec2]] + // CHECK: fmul fast <[[NUM]] x float> [[tmp]], [[fvec1]] + fRes += lerp(fVec2, fVec3, fVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: fdiv fast <[[NUM]] x half> , [[hvec1]] + hRes += rcp(hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: fdiv fast <[[NUM]] x float> , [[fvec1]] + fRes += rcp(fVec1); + + vector signs = 1; + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x half> [[hvec1]], zeroinitializer + // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x half> [[hvec1]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(hVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x float> [[fvec1]], zeroinitializer + // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] 
x float> [[fvec1]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(fVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = fcmp fast ogt <[[NUM]] x double> [[dvec1]], zeroinitializer + // CHECK: [[lt:%.*]] = fcmp fast olt <[[NUM]] x double> [[dvec1]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(dVec1); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i16> [[svec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i16> [[svec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(sVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i16> [[usvec2]], zeroinitializer + // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32> + signs *= sign(usVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i32> [[ivec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i32> [[ivec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: [[sub:%.*]] = sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(iVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i32> [[uivec2]], zeroinitializer + // CHECK: zext <[[NUM]] x i1> 
[[cmp]] to <[[NUM]] x i32> + signs *= sign(uiVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[gt:%.*]] = icmp sgt <[[NUM]] x i64> [[lvec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <[[NUM]] x i64> [[lvec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <[[NUM]] x i1> [[gt]] to <[[NUM]] x i32> + // CHECK: [[ilt:%.*]] = zext <[[NUM]] x i1> [[lt]] to <[[NUM]] x i32> + // CHECK: sub nsw <[[NUM]] x i32> [[igt]], [[ilt]] + signs *= sign(lVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[cmp:%.*]] = icmp ne <[[NUM]] x i64> [[ulvec2]], zeroinitializer + // CHECK: zext <[[NUM]] x i1> [[cmp]] to <[[NUM]] x i32> + signs *= sign(ulVec2); + + iRes += signs; + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[bvec2:%.*]] = icmp ne <[[NUM]] x i16> [[svec2]], zeroinitializer + // CHECK: [[bvec1:%.*]] = icmp ne <[[NUM]] x i16> [[svec1]], zeroinitializer + // CHECK: or <[[NUM]] x i1> [[bvec2]], [[bvec1]] + sRes += or(sVec1, sVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: [[bvec3:%.*]] = icmp ne <[[NUM]] x i16> [[svec3]], zeroinitializer + // CHECK: and <[[NUM]] x i1> [[bvec3]], [[bvec2]] + sRes += and(sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: select <[[NUM]] x i1> [[bvec1]], <[[NUM]] x i16> [[svec2]], <[[NUM]] x i16> [[svec3]] + sRes += select(sVec1, sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(2048, fRes); + buf.Store >(4096, dRes); + + ibuf.Store >(0, sRes); + ibuf.Store >(1024, usRes); + ibuf.Store >(2048, iRes); + ibuf.Store >(3072, uiRes); + ibuf.Store >(4096, lRes); + ibuf.Store >(5120, ulRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl new file mode 100644 index 0000000000..4886f04e01 --- /dev/null 
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl @@ -0,0 +1,146 @@ +// RUN: %dxc -T ps_6_9 %s | FileCheck %s + +// Long vector tests for vec ops that scalarize to something more complex +// than a simple repetition of the same dx.op calls. + +StructuredBuffer< vector > buf; +ByteAddressBuffer rbuf; + +float4 main(uint i : SV_PrimitiveID, bool b : B) : SV_Target { + vector vec1 = rbuf.Load< vector >(i++*32); + vector vec2 = rbuf.Load< vector >(i++*32); + vector vec3 = rbuf.Load< vector >(i++*32); + + // CHECK: fdiv fast <8 x float> + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value) + // CHECK: fadd fast <8 x float> %{{.*}}, %{{.*}}, + // CHECK: fcmp fast oeq <8 x float> + // CHECK: fcmp fast oge <8 x float> + // CHECK: fcmp fast olt <8 x float> + // CHECK: and <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> + // CHECK: and <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> + // CHECK: and <8 x i1> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> + // CHECK: select <8 x i1> %{{.*}}, <8 x float> + // CHECK: fsub fast <8 x float> + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; 
FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + // CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value) + + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + // CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value) + + // CHECK: fsub fast <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> + // CHECK: fmul fast <8 x float> + vec1 = fmod(vec1, vec2); + + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: fmul fast <8 x float> + vec1 = ldexp(vec1, vec2); + + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float 
@dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value) + // CHECK: fmul fast <8 x float> + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + // CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value) + vec1 = pow(vec1, vec2); + + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value) + // CHECK: fsub fast <8 x float> + vec1 = modf(vec1, vec2); + + // CHECK: fmul fast float + // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: call 
float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + vec1 = dot(vec1, vec2); + + vector bvec = b; + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + // CHECK: or i1 + bvec &= any(vec1); + + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + // CHECK: and i1 + bvec &= all(vec2); + + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + // call {{.*}} @dx.op.wave + return WaveMatch(bvec); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl new file mode 100644 index 0000000000..02cad5b894 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-float-intrinsics.hlsl @@ -0,0 +1,69 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=35 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=35 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=36 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=36 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled binary intrinsics that take float-like parameters and +// and are "trivial" in 
that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> +// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode number. + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec1 = buf.Load >(0); + vector hVec2 = buf.Load >(512); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec1 = buf.Load >(2048); + vector fVec2 = buf.Load >(2560); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 
4096 + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + vector dVec1 = buf.Load >(4096); + vector dVec2 = buf.Load >(4608); + + // Test simple matching type overloads. + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]]) + vector hRes = FUNC(hVec1, hVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]]) + vector fRes = FUNC(fVec1, fVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]]) + vector dRes = FUNC(dVec1, dVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(2048, fRes); + buf.Store >(4096, dRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl new file mode 100644 index 0000000000..994246b753 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-binary-int-intrinsics.hlsl @@ -0,0 +1,116 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=37 -DUOP=39 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=37 -DUOP=39 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=38 -DUOP=40 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=38 -DUOP=40 -DNUM=1022 %s | FileCheck %s + +#ifndef UOP 
+#define UOP OP +#endif + +// Test vector-enabled binary intrinsics that take signed and unsigned integer parameters of +// different widths and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode numbers. + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(888, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]] + buf.Store(999, UOP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec1 = buf.Load >(0); + vector sVec2 = buf.Load >(512); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[usvec1:%.*]] = 
extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536 + // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec1 = buf.Load >(1024); + vector usVec2 = buf.Load >(1536); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec1 = buf.Load >(2048); + vector iVec2 = buf.Load >(2560); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584 + // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec1 = buf.Load >(3072); + vector uiVec2 = buf.Load >(3584); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec1 = buf.Load >(4096); + vector lVec2 = buf.Load >(4608); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[ulvec1:%.*]] = extractvalue 
%dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632 + // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec1 = buf.Load >(5120); + vector ulVec2 = buf.Load >(5632); + + // Test simple matching type overloads. + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]]) + vector sRes = FUNC(sVec1, sVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]]) + vector usRes = FUNC(usVec1, usVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]]) + vector iRes = FUNC(iVec1, iVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]]) + vector uiRes = FUNC(uiVec1, uiVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]]) + vector lRes = FUNC(lVec1, lVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]]) + vector ulRes = FUNC(ulVec1, ulVec2); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, sRes); + buf.Store >(1024, usRes); + buf.Store >(2048, iRes); + buf.Store >(3072, uiRes); + buf.Store >(4096, lRes); + buf.Store >(5120, ulRes); +} diff --git 
a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl new file mode 100644 index 0000000000..40ffd3fe63 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-scalarized-intrinsics.hlsl @@ -0,0 +1,87 @@ +// The binary part of some of these is all just a vector math ops with as many unary dxops as elements. +// These will have apparent mismatches between the ARITY define and the check prefix. + +// RUN: %dxc -DFUNC=abs -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=pow -DARITY=2 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=f16tof32 -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,LEGACY +// RUN: %dxc -DFUNC=f32tof16 -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,LEGACY +// RUN: %dxc -DFUNC=isfinite -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT +// RUN: %dxc -DFUNC=isinf -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT +// RUN: %dxc -DFUNC=isnan -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,SPECFLT +// RUN: %dxc -DFUNC=modf -DARITY=2 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=countbits -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=firstbithigh -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=firstbitlow -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx_coarse -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddx_fine -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy -DARITY=1 -T ps_6_9 %s | FileCheck %s 
--check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy_coarse -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=ddy_fine -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=fwidth -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,UNARY +// RUN: %dxc -DFUNC=QuadReadLaneAt -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=QuadReadAcrossX -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=QuadReadAcrossY -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=QuadReadAcrossDiagonal -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,QUAD +// RUN: %dxc -DFUNC=WaveActiveBitAnd -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveBitOr -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveBitXor -DARITY=1 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveProduct -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveSum -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveMin -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveMax -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixBitAnd -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixBitOr -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixBitXor -DARITY=5 -DTYPE=uint -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveMultiPrefixProduct -DARITY=5 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc 
-DFUNC=WaveMultiPrefixSum -DARITY=5 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WavePrefixSum -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WavePrefixProduct -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveReadLaneAt -DARITY=4 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveReadLaneFirst -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE +// RUN: %dxc -DFUNC=WaveActiveAllEqual -DARITY=1 -T ps_6_9 %s | FileCheck %s --check-prefixes=CHECK,WAVE + +#ifndef TYPE +#define TYPE float +#endif + +#if ARITY == 1 +#define CALLARGS(x,y,z) x +#elif ARITY == 2 +#define CALLARGS(x,y,z) x, y +#elif ARITY == 3 +#define CALLARGS(x,y,z) x, y, z +// ARITY 4 is used for 1 vec + scalar +#elif ARITY == 4 +#define CALLARGS(x,y,z) x, i +// ARITY 5 is used for 1 vec + uint4 mask for wavemultiprefix* +#elif ARITY == 5 +#define CALLARGS(x,y,z) x, m +#endif + +StructuredBuffer< vector > buf; +ByteAddressBuffer rbuf; + +float4 main(uint i : SV_PrimitiveID, uint4 m : M) : SV_Target { + vector arg1 = rbuf.Load< vector >(i++*32); + vector arg2 = rbuf.Load< vector >(i++*32); + vector arg3 = rbuf.Load< vector >(i++*32); + + // UNARY: call {{.*}} [[DXOP:@dx.op.unary]] + // BINARY: call {{.*}} [[DXOP:@dx.op.binary]] + // TERTIARY: call {{.*}} [[DXOP:@dx.op.tertiary]] + // LEGACY: call {{.*}} [[DXOP:@dx.op.legacy]] + // SPECFLT: call {{.*}} [[DXOP:@dx.op.isSpecialFloat]] + // QUAD: call {{.*}} [[DXOP:@dx.op.quad]] + // WAVE: call {{.*}} [[DXOP:@dx.op.wave]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + // CHECK: call {{.*}} [[DXOP]] + + vector ret = FUNC(CALLARGS(arg1, arg2, arg3)); + return float4(ret[0] + ret[1], ret[2] + ret[3], ret[4] + ret[5], ret[6] + ret[7]); +} diff --git 
a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl new file mode 100644 index 0000000000..e32ebc1db2 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-float-intrinsics.hlsl @@ -0,0 +1,86 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=46 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=46 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled ternary intrinsics that take float-like parameters +// and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +// Given that all we have at the moment are fmad and fma and the latter only takes doubles, +// fma is tacked on as an additional check. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> +// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double> +
+[numthreads(8,1,1)] +void main() { + + // Capture opcode number. 
+ // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec1 = buf.Load >(0); + vector hVec2 = buf.Load >(512); + vector hVec3 = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec1 = buf.Load >(2048); + vector fVec2 = buf.Load >(2560); + vector fVec3 = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] 
@dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0 + vector dVec1 = buf.Load >(4096); + vector dVec2 = buf.Load >(4608); + vector dVec3 = buf.Load >(5120); + + // Test simple matching type overloads. + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x half> @dx.op.tertiary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]], <[[NUM]] x half> [[hvec3]]) + vector hRes = FUNC(hVec1, hVec2, hVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x float> @dx.op.tertiary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]], <[[NUM]] x float> [[fvec3]]) + vector fRes = FUNC(fVec1, fVec2, fVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x double> @dx.op.tertiary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]], <[[NUM]] x double> [[dvec3]]) + vector dRes = FUNC(dVec1, dVec2, dVec3); + + // Tacked on fma() check since it only takes doubles. 
+ // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x double> @dx.op.tertiary.[[DTY]](i32 47, <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]], <[[NUM]] x double> [[dvec3]]) + vector dRes2 = fma(dVec1, dVec2, dVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(2048, fRes); + buf.Store >(4096, dRes); + buf.Store >(5120, dRes2); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl new file mode 100644 index 0000000000..50f98715e4 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-tertiary-int-intrinsics.hlsl @@ -0,0 +1,131 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=48 -DUOP=49 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=mad -DOP=48 -DUOP=49 -DNUM=1022 %s | FileCheck %s + +#ifndef UOP +#define UOP OP +#endif + +// Test vector-enabled tertiary intrinsics that take signed and unsigned integer parameters of +// different widths and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode numbers. 
+ // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(888, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]] + buf.Store(999, UOP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512 + // CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[svec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec1 = buf.Load >(0); + vector sVec2 = buf.Load >(512); + vector sVec3 = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1025 + // CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536 + // CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, 
%dx.types.Handle [[buf]], i32 2048 + // CHECK: [[usvec3:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec1 = buf.Load >(1025); + vector usVec2 = buf.Load >(1536); + vector usVec3 = buf.Load >(2048); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2049 + // CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560 + // CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[ivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec1 = buf.Load >(2049); + vector iVec2 = buf.Load >(2560); + vector iVec3 = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3073 + // CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584 + // CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[uivec3:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec1 = buf.Load >(3073); + vector uiVec2 = buf.Load >(3584); + vector uiVec3 = buf.Load >(4096); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4097 + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] 
@dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608 + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[lvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec1 = buf.Load >(4097); + vector lVec2 = buf.Load >(4608); + vector lVec3 = buf.Load >(5120); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5121 + // CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632 + // CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 6144 + // CHECK: [[ulvec3:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec1 = buf.Load >(5121); + vector ulVec2 = buf.Load >(5632); + vector ulVec3 = buf.Load >(6144); + + // Test simple matching type overloads. 
+ // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.tertiary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]], <[[NUM]] x i16> [[svec3]]) + vector sRes = FUNC(sVec1, sVec2, sVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.tertiary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]], <[[NUM]] x i16> [[usvec3]]) + vector usRes = FUNC(usVec1, usVec2, usVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.tertiary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]], <[[NUM]] x i32> [[ivec3]]) + vector iRes = FUNC(iVec1, iVec2, iVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.tertiary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]], <[[NUM]] x i32> [[uivec3]]) + vector uiRes = FUNC(uiVec1, uiVec2, uiVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.tertiary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]], <[[NUM]] x i64> [[lvec3]]) + vector lRes = FUNC(lVec1, lVec2, lVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.tertiary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]], <[[NUM]] x i64> [[ulvec3]]) + vector ulRes = FUNC(ulVec1, ulVec2, ulVec3); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, sRes); + buf.Store >(1024, usRes); + buf.Store >(2048, iRes); + buf.Store >(3072, uiRes); + buf.Store >(4096, lRes); + buf.Store >(5120, ulRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl new file mode 100644 index 
0000000000..91ab631a7e --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-float-intrinsics.hlsl @@ -0,0 +1,83 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate -DOP=7 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=saturate -DOP=7 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos -DOP=12 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cos -DOP=12 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sin -DOP=13 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sin -DOP=13 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tan -DOP=14 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tan -DOP=14 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=acos -DOP=15 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=acos -DOP=15 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=asin -DOP=16 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=asin -DOP=16 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=atan -DOP=17 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=atan -DOP=17 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cosh -DOP=18 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=cosh -DOP=18 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sinh -DOP=19 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sinh -DOP=19 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tanh -DOP=20 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=tanh -DOP=20 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 
-enable-16bit-types -DFUNC=exp2 -DOP=21 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=exp2 -DOP=21 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=frac -DOP=22 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=frac -DOP=22 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log2 -DOP=23 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log2 -DOP=23 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log10 -DOP=23 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=log10 -DOP=23 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sqrt -DOP=24 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=sqrt -DOP=24 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=rsqrt -DOP=25 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=rsqrt -DOP=25 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=round -DOP=26 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=round -DOP=26 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=floor -DOP=27 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=floor -DOP=27 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ceil -DOP=28 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=ceil -DOP=28 -DNUM=1022 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=trunc -DOP=29 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled unary intrinsics that take float-like parameters and +// and are "trivial" in that they can be implemented with a single call +// instruction with the same 
parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half> +// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float> + +[numthreads(8,1,1)] +void main() { + + // Capture opcode number. + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[hvec:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0 + vector hVec = buf.Load >(0); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[fvec:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0 + vector fVec = buf.Load >(1024); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x half> @dx.op.unary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec]]) + vector hRes = FUNC(hVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x float> @dx.op.unary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec]]) + vector fRes = FUNC(fVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, hRes); + buf.Store >(1024, fRes); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl new file mode 100644 index 0000000000..ef0b250745 --- /dev/null +++ 
b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-trivial-unary-int-intrinsics.hlsl @@ -0,0 +1,86 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=reversebits -DOP=30 -DNUM=7 %s | FileCheck %s +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=reversebits -DOP=30 -DNUM=1022 %s | FileCheck %s + +// Test vector-enabled unary intrinsics that take signed and unsigned integer parameters of +// different widths and are "trivial" in that they can be implemented with a single call +// instruction with the same parameter and return types. + +RWByteAddressBuffer buf; + +// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16> +// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32> +// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64> + +[numthreads(8,1,1)] +void main() { + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // Capture opcode number. 
+ // CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]] + buf.Store(999, OP); + + // CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 }) + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0 + // CHECK: [[svec:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector sVec = buf.Load >(0); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024 + // CHECK: [[usvec:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0 + vector usVec = buf.Load >(1024); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048 + // CHECK: [[ivec:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector iVec = buf.Load >(2048); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072 + // CHECK: [[uivec:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0 + vector uiVec = buf.Load >(3072); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096 + // CHECK: [[lvec:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector lVec = buf.Load >(4096); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120 + // CHECK: [[ulvec:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0 + vector ulVec = buf.Load >(5120); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.unary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec]]) + vector sRes = FUNC(sVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: 
insertelement + // CHECK: call <[[NUM]] x i16> @dx.op.unary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[usvec]]) + vector usRes = FUNC(usVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.unary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec]]) + vector iRes = FUNC(iVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i32> @dx.op.unary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[uivec]]) + vector uiRes = FUNC(uiVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.unary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec]]) + vector lRes = FUNC(lVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + // CHECK: call <[[NUM]] x i64> @dx.op.unary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[ulvec]]) + vector ulRes = FUNC(ulVec); + + // CHECK-NOT: extractelement + // CHECK-NOT: insertelement + buf.Store >(0, sRes); + buf.Store >(1024, usRes); + buf.Store >(2048, iRes); + buf.Store >(3072, uiRes); + buf.Store >(4096, lRes); + buf.Store >(5120, ulRes); +} diff --git a/utils/hct/hctdb.py b/utils/hct/hctdb.py index 691c3ba58f..548aae4192 100644 --- a/utils/hct/hctdb.py +++ b/utils/hct/hctdb.py @@ -1503,7 +1503,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "returns the " + i, - "hfd", + "hfd<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1537,7 +1537,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "returns the " + i, - "hf", + "hf<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1554,7 +1554,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "returns the reverse bit pattern of the input value", - "wil", + "wil<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1601,7 +1601,7 @@ def UFI(name, **mappings): next_op_idx, "Binary", "returns the " + i + " of the input values", - "hfd", + "hfd<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1619,7 +1619,7 @@ def UFI(name, 
**mappings): next_op_idx, "Binary", "returns the " + i + " of the input values", - "wil", + "wil<", "rn", [ db_dxil_param(0, "$o", "", "operation result"), @@ -1674,7 +1674,7 @@ def UFI(name, **mappings): next_op_idx, "Tertiary", "performs a fused multiply add (FMA) of the form a * b + c", - "hfd", + "hfd<", "rn", [ db_dxil_param( @@ -1691,7 +1691,7 @@ def UFI(name, **mappings): next_op_idx, "Tertiary", "performs a fused multiply add (FMA) of the form a * b + c", - "d", + "d<", "rn", [ db_dxil_param( @@ -1715,7 +1715,7 @@ def UFI(name, **mappings): next_op_idx, "Tertiary", "performs an integral " + i, - "wil", + "wil<", "rn", [ db_dxil_param(0, "$o", "", "the operation result"), @@ -2608,7 +2608,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per stamp", - "hf", + "hf<", "rn", [ db_dxil_param( @@ -2626,7 +2626,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per stamp", - "hf", + "hf<", "rn", [ db_dxil_param( @@ -2644,7 +2644,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per pixel", - "hf", + "hf<", "rn", [ db_dxil_param( @@ -2662,7 +2662,7 @@ def UFI(name, **mappings): next_op_idx, "Unary", "computes the rate of change of components per pixel", - "hf", + "hf<", "rn", [ db_dxil_param( From db9b361ad0fbfe7f9710572bd73948b895d5c73f Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Mon, 7 Apr 2025 13:04:22 -0600 Subject: [PATCH 2/6] generated code update --- lib/DXIL/DxilOperations.cpp | 140 ++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 70 deletions(-) diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp index 0b4c7218d4..7047d9fe59 100644 --- a/lib/DXIL/DxilOperations.cpp +++ b/lib/DXIL/DxilOperations.cpp @@ -96,16 +96,16 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = { "unary", Attribute::ReadNone, 1, - {{0x7}}, - {{0x0}}}, // Overloads: hfd + 
{{0x407}}, + {{0x7}}}, // Overloads: hfd Date: Mon, 7 Apr 2025 13:54:15 -0600 Subject: [PATCH 3/6] Pre-empt any and all variable capitalization discussion Any altered function is brought inline with LLVM coding standards for variable capitalization. --- lib/HLSL/HLOperationLower.cpp | 402 +++++++++++++++++----------------- 1 file changed, 199 insertions(+), 203 deletions(-) diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index 8dda0230ba..6292e66120 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -487,24 +487,24 @@ Value *TrivialDxilVectorOperation(Function *Func, OP::OpCode Opcode, // for each vector element and reconstruct the vector type from those results or // operate on and return native vectors depending on vector size and the value // of `SupportsVectors`, which is determined by version and opcode support. -Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, - Type *Ty, Type *RetTy, OP *hlslOP, +Value *TrivialDxilOperation(OP::OpCode Opcode, ArrayRef Args, + Type *Ty, Type *RetTy, OP *OP, IRBuilder<> &Builder, bool SupportsVectors = false) { // If supported and the overload type is a vector with more than 1 element, // create a native vector operation. if (SupportsVectors && Ty->isVectorTy() && Ty->getVectorNumElements() > 1) { - Function *dxilFunc = hlslOP->GetOpFunc(opcode, Ty); - return TrivialDxilVectorOperation(dxilFunc, opcode, refArgs, Ty, hlslOP, + Function *Func = OP->GetOpFunc(Opcode, Ty); + return TrivialDxilVectorOperation(Func, Opcode, Args, Ty, OP, Builder); } // Set overload type to the scalar type of `Ty` and generate call(s). 
Type *EltTy = Ty->getScalarType(); - Function *dxilFunc = hlslOP->GetOpFunc(opcode, EltTy); + Function *Func = OP->GetOpFunc(Opcode, EltTy); - return TrivialDxilOperation(dxilFunc, opcode, refArgs, Ty, RetTy, hlslOP, + return TrivialDxilOperation(Func, Opcode, Args, Ty, RetTy, OP, Builder); } @@ -524,9 +524,9 @@ Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, // return type from the overload by passing the argument, explicit return type, // and helper objects to the scalarizing unary dxil operation creation. Value *TrivialUnaryOperationRet(CallInst *CI, IntrinsicOp IOP, - OP::OpCode opcode, + OP::OpCode Opcode, HLOperationLowerHelper &Helper, - HLObjectOperationLowerHelper *pObjHelper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { Value *Src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); Type *Ty = Src->getType(); @@ -534,96 +534,96 @@ Value *TrivialUnaryOperationRet(CallInst *CI, IntrinsicOp IOP, IRBuilder<> Builder(CI); hlsl::OP *OP = &Helper.hlslOP; Type *RetTy = CI->getType(); - Constant *opArg = OP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, Src}; + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src}; - return TrivialDxilOperation(opcode, args, Ty, RetTy, OP, Builder); + return TrivialDxilOperation(Opcode, Args, Ty, RetTy, OP, Builder); } -Value *TrivialDxilUnaryOperation(OP::OpCode opcode, Value *src, - hlsl::OP *hlslOP, IRBuilder<> &Builder, +Value *TrivialDxilUnaryOperation(OP::OpCode Opcode, Value *Src, + hlsl::OP *OP, IRBuilder<> &Builder, bool SupportsVectors = false) { - Type *Ty = src->getType(); + Type *Ty = Src->getType(); - Constant *OpArg = hlslOP->GetU32Const((unsigned)opcode); - Value *Args[] = {OpArg, src}; + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src}; - return TrivialDxilOperation(opcode, Args, Ty, Ty, hlslOP, Builder, + return TrivialDxilOperation(Opcode, Args, Ty, Ty, OP, Builder, SupportsVectors); } -Value 
*TrivialDxilBinaryOperation(OP::OpCode opcode, Value *src0, Value *src1, - hlsl::OP *hlslOP, IRBuilder<> &Builder, +Value *TrivialDxilBinaryOperation(OP::OpCode Opcode, Value *Src0, Value *Src1, + hlsl::OP *OP, IRBuilder<> &Builder, bool SupportsVectors = false) { - Type *Ty = src0->getType(); + Type *Ty = Src0->getType(); - Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, src0, src1}; + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src0, Src1}; - return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder, + return TrivialDxilOperation(Opcode, Args, Ty, Ty, OP, Builder, SupportsVectors); } -Value *TrivialDxilTrinaryOperation(OP::OpCode opcode, Value *src0, Value *src1, - Value *src2, hlsl::OP *hlslOP, +Value *TrivialDxilTrinaryOperation(OP::OpCode Opcode, Value *Src0, Value *Src1, + Value *Src2, hlsl::OP *OP, IRBuilder<> &Builder, bool SupportsVectors = false) { - Type *Ty = src0->getType(); + Type *Ty = Src0->getType(); - Constant *opArg = hlslOP->GetU32Const((unsigned)opcode); - Value *args[] = {opArg, src0, src1, src2}; + Constant *OpArg = OP->GetU32Const((unsigned)Opcode); + Value *Args[] = {OpArg, Src0, Src1, Src2}; - return TrivialDxilOperation(opcode, args, Ty, Ty, hlslOP, Builder, + return TrivialDxilOperation(Opcode, Args, Ty, Ty, OP, Builder, SupportsVectors); } // Translate call that trivially converts to a dxil unary operation by passing // argument, return type, and helper objects to either scalarizing or native // vector dxil operation creation depending on version and vector size. 
-Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TrivialUnaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - Value *src0 = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Value *Src0 = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); IRBuilder<> Builder(CI); - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; - return TrivialDxilUnaryOperation(opcode, src0, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + return TrivialDxilUnaryOperation(Opcode, Src0, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } // Translate call that trivially converts to a dxil binary operation by passing // arguments, return type, and helper objects to either scalarizing or native // vector dxil operation creation depending on version and vector size. 
-Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TrivialBinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *src0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); - Value *src1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); + hlsl::OP *OP = &Helper.hlslOP; + Value *Src0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); + Value *Src1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); IRBuilder<> Builder(CI); - return TrivialDxilBinaryOperation(opcode, src0, src1, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + return TrivialDxilBinaryOperation(Opcode, Src0, Src1, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } // Translate call that trivially converts to a dxil trinary (aka tertiary) // operation by passing arguments, return type, and helper objects to either // scalarizing or native vector dxil operation creation depending on version // and vector size. 
-Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TrivialTrinaryOperation(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *src0 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx); - Value *src1 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx); - Value *src2 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); + hlsl::OP *OP = &Helper.hlslOP; + Value *Src0 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc0Idx); + Value *Src1 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc1Idx); + Value *Src2 = CI->getArgOperand(HLOperandIndex::kTrinaryOpSrc2Idx); IRBuilder<> Builder(CI); - return TrivialDxilTrinaryOperation(opcode, src0, src1, src2, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + return TrivialDxilTrinaryOperation(Opcode, Src0, Src1, Src2, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } Value *TrivialIsSpecialFloat(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -788,54 +788,54 @@ Value *TranslateD3DColorToUByte4(CallInst *CI, IntrinsicOp IOP, // | float | False | 2 | // +----------+---------------------+------------------+ -bool CanUseFxcMulOnlyPatternForPow(IRBuilder<> &Builder, Value *x, Value *pow, - int32_t &powI) { +bool CanUseFxcMulOnlyPatternForPow(IRBuilder<> &Builder, Value *X, Value *Pow, + int32_t &PowI) { // Applicable only when power is a literal. - if (!isa(pow) && !isa(pow)) { + if (!isa(Pow) && !isa(Pow)) { return false; } // Only apply this code gen on splat values. 
- if (ConstantDataVector *cdv = dyn_cast(pow)) { - if (!hlsl::dxilutil::IsSplat(cdv)) { + if (ConstantDataVector *Cdv = dyn_cast(Pow)) { + if (!hlsl::dxilutil::IsSplat(Cdv)) { return false; } } // Only apply on aggregates of 16 or fewer elements, // representing the max 4x4 matrix size. - Type *Ty = x->getType(); + Type *Ty = X->getType(); if (Ty->isVectorTy() && Ty->getVectorNumElements() > 16) return false; - APFloat powAPF = isa(pow) - ? cast(pow)->getElementAsAPFloat(0) + APFloat PowAPF = isa(Pow) + ? cast(Pow)->getElementAsAPFloat(0) : // should be a splat value - cast(pow)->getValueAPF(); - APSInt powAPS(32, false); - bool isExact = false; + cast(Pow)->getValueAPF(); + APSInt PowAPS(32, false); + bool IsExact = false; // Try converting float value of power to integer and also check if the float // value is exact. - APFloat::opStatus status = - powAPF.convertToInteger(powAPS, APFloat::rmTowardZero, &isExact); - if (status == APFloat::opStatus::opOK && isExact) { - powI = powAPS.getExtValue(); - uint32_t powU = abs(powI); - int setBitCount = 0; - int maxBitSetPos = -1; - for (int i = 0; i < 32; i++) { - if ((powU >> i) & 1) { - setBitCount++; - maxBitSetPos = i; + APFloat::opStatus Status = + PowAPF.convertToInteger(PowAPS, APFloat::rmTowardZero, &IsExact); + if (Status == APFloat::opStatus::opOK && IsExact) { + PowI = PowAPS.getExtValue(); + uint32_t PowU = abs(PowI); + int SetBitCount = 0; + int MaxBitSetPos = -1; + for (int I = 0; I < 32; I++) { + if ((PowU >> I) & 1) { + SetBitCount++; + MaxBitSetPos = I; } } - DXASSERT(maxBitSetPos <= 30, "msb should always be zero."); - unsigned numElem = - isa(pow) ? x->getType()->getVectorNumElements() : 1; - int mulOpThreshold = powI < 0 ? numElem + 1 : 2 * numElem + 1; - int mulOpNeeded = maxBitSetPos + setBitCount - 1; - return mulOpNeeded <= mulOpThreshold; + DXASSERT(MaxBitSetPos <= 30, "msb should always be zero."); + unsigned NumElem = + isa(Pow) ? 
X->getType()->getVectorNumElements() : 1; + int MulOpThreshold = PowI < 0 ? NumElem + 1 : 2 * NumElem + 1; + int MulOpNeeded = MaxBitSetPos + SetBitCount - 1; + return MulOpNeeded <= MulOpThreshold; } return false; @@ -2087,46 +2087,44 @@ Value *TranslateDst(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return Result; } -Value *TranslateFirstbitHi(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateFirstbitHi(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - Value *firstbitHi = - TrivialUnaryOperationRet(CI, IOP, opcode, helper, pObjHelper, Translated); + Value *FirstbitHi = + TrivialUnaryOperationRet(CI, IOP, Opcode, Helper, ObjHelper, Translated); // firstbitHi == -1? -1 : (bitWidth-1 -firstbitHi); IRBuilder<> Builder(CI); - Constant *neg1 = Builder.getInt32(-1); - Value *src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Neg1 = Builder.getInt32(-1); + Value *Src = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Type *Ty = src->getType(); + Type *Ty = Src->getType(); IntegerType *EltTy = cast(Ty->getScalarType()); - Constant *bitWidth = Builder.getInt32(EltTy->getBitWidth() - 1); + Constant *BitWidth = Builder.getInt32(EltTy->getBitWidth() - 1); if (Ty == Ty->getScalarType()) { - Value *sub = Builder.CreateSub(bitWidth, firstbitHi); - Value *cond = Builder.CreateICmpEQ(neg1, firstbitHi); - return Builder.CreateSelect(cond, neg1, sub); + Value *Sub = Builder.CreateSub(BitWidth, FirstbitHi); + Value *Cond = Builder.CreateICmpEQ(Neg1, FirstbitHi); + return Builder.CreateSelect(Cond, Neg1, Sub); } else { - Value *result = UndefValue::get(CI->getType()); - unsigned vecSize = Ty->getVectorNumElements(); - for (unsigned i = 0; i < vecSize; i++) { - Value *EltFirstBit = Builder.CreateExtractElement(firstbitHi, i); - Value *sub = 
Builder.CreateSub(bitWidth, EltFirstBit); - Value *cond = Builder.CreateICmpEQ(neg1, EltFirstBit); - Value *Elt = Builder.CreateSelect(cond, neg1, sub); - result = Builder.CreateInsertElement(result, Elt, i); + Value *Result = UndefValue::get(CI->getType()); + unsigned VecSize = Ty->getVectorNumElements(); + for (unsigned I = 0; I < VecSize; I++) { + Value *EltFirstBit = Builder.CreateExtractElement(FirstbitHi, I); + Value *Sub = Builder.CreateSub(BitWidth, EltFirstBit); + Value *Cond = Builder.CreateICmpEQ(Neg1, EltFirstBit); + Value *Elt = Builder.CreateSelect(Cond, Neg1, Sub); + Result = Builder.CreateInsertElement(Result, Elt, I); } - return result; + return Result; } } -Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - Value *firstbitLo = - TrivialUnaryOperationRet(CI, IOP, opcode, helper, pObjHelper, Translated); - return firstbitLo; + return TrivialUnaryOperationRet(CI, IOP, Opcode, Helper, ObjHelper, Translated); } Value *TranslateLit(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2273,63 +2271,61 @@ Value *TranslateDistance(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return TranslateLength(CI, sub, hlslOP); } -Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); - Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *log2eConst = ConstantFP::get(Ty->getScalarType(), 
M_LOG2E); - if (Ty != Ty->getScalarType()) { - log2eConst = - ConstantVector::getSplat(Ty->getVectorNumElements(), log2eConst); - } - val = Builder.CreateFMul(log2eConst, val); + Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Log2eConst = ConstantFP::get(Ty->getScalarType(), M_LOG2E); + if (Ty != Ty->getScalarType()) + Log2eConst = + ConstantVector::getSplat(Ty->getVectorNumElements(), Log2eConst); + Val = Builder.CreateFMul(Log2eConst, Val); - return TrivialDxilUnaryOperation(OP::OpCode::Exp, val, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + return TrivialDxilUnaryOperation(OP::OpCode::Exp, Val, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); } -Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); - Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *ln2Const = ConstantFP::get(Ty->getScalarType(), M_LN2); - if (Ty != Ty->getScalarType()) { - ln2Const = ConstantVector::getSplat(Ty->getVectorNumElements(), ln2Const); - } + Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Ln2Const = ConstantFP::get(Ty->getScalarType(), M_LN2); + if (Ty != Ty->getScalarType()) + Ln2Const = ConstantVector::getSplat(Ty->getVectorNumElements(), Ln2Const); Value *log = - TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + TrivialDxilUnaryOperation(OP::OpCode::Log, Val, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); - return Builder.CreateFMul(ln2Const, log); + return Builder.CreateFMul(Ln2Const, log); } -Value *TranslateLog10(CallInst 
*CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateLog10(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); - Value *val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *log2_10Const = ConstantFP::get(Ty->getScalarType(), M_LN2 / M_LN10); + Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); + Constant *Log2to10Const = ConstantFP::get(Ty->getScalarType(), M_LN2 / M_LN10); if (Ty != Ty->getScalarType()) { - log2_10Const = - ConstantVector::getSplat(Ty->getVectorNumElements(), log2_10Const); + Log2to10Const = + ConstantVector::getSplat(Ty->getVectorNumElements(), Log2to10Const); } - Value *log = - TrivialDxilUnaryOperation(OP::OpCode::Log, val, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + Value *Log = + TrivialDxilUnaryOperation(OP::OpCode::Log, Val, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); - return Builder.CreateFMul(log2_10Const, log); + return Builder.CreateFMul(Log2to10Const, Log); } Value *TranslateFMod(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2512,16 +2508,16 @@ Value *TrivialDotOperation(OP::OpCode opcode, Value *src0, Value *src1, // Instead of using a DXIL intrinsic, implement a dot product operation using // multiply and add operations. Used for integer dots and long vectors. 
-Value *ExpandDot(Value *arg0, Value *arg1, unsigned vecSize, hlsl::OP *hlslOP, +Value *ExpandDot(Value *Arg0, Value *Arg1, unsigned VecSize, hlsl::OP *OP, IRBuilder<> &Builder, DXIL::OpCode MadOpCode = DXIL::OpCode::IMad) { - Value *Elt0 = Builder.CreateExtractElement(arg0, (uint64_t)0); - Value *Elt1 = Builder.CreateExtractElement(arg1, (uint64_t)0); + Value *Elt0 = Builder.CreateExtractElement(Arg0, (uint64_t)0); + Value *Elt1 = Builder.CreateExtractElement(Arg1, (uint64_t)0); Value *Result = Builder.CreateMul(Elt0, Elt1); - for (unsigned Elt = 1; Elt < vecSize; ++Elt) { - Elt0 = Builder.CreateExtractElement(arg0, Elt); - Elt1 = Builder.CreateExtractElement(arg1, Elt); - Result = TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, hlslOP, + for (unsigned Elt = 1; Elt < VecSize; ++Elt) { + Elt0 = Builder.CreateExtractElement(Arg0, Elt); + Elt1 = Builder.CreateExtractElement(Arg1, Elt); + Result = TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, OP, Builder); } @@ -2550,26 +2546,26 @@ Value *TranslateFDot(Value *arg0, Value *arg1, unsigned vecSize, } } -Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); - Type *Ty = arg0->getType(); - unsigned vecSize = Ty->getVectorNumElements(); - Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); + hlsl::OP *OP = &Helper.hlslOP; + Value *Arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); + Type *Ty = Arg0->getType(); + unsigned VecSize = Ty->getVectorNumElements(); + Value *Arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); IRBuilder<> Builder(CI); Type *EltTy = Ty->getScalarType(); if (EltTy->isFloatingPointTy() 
&& Ty->getVectorNumElements() <= 4) { - return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder); + return TranslateFDot(Arg0, Arg1, VecSize, OP, Builder); } else { DXIL::OpCode MadOpCode = DXIL::OpCode::IMad; if (IOP == IntrinsicOp::IOP_udot) MadOpCode = DXIL::OpCode::UMad; else if (EltTy->isFloatingPointTy()) MadOpCode = DXIL::OpCode::FMad; - return ExpandDot(arg0, arg1, vecSize, hlslOP, Builder, MadOpCode); + return ExpandDot(Arg0, Arg1, VecSize, OP, Builder, MadOpCode); } } @@ -2672,32 +2668,32 @@ Value *TranslateRefract(CallInst *CI, IntrinsicOp IOP, OP::OpCode op, return refract; } -Value *TranslateSmoothStep(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateSmoothStep(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; + hlsl::OP *OP = &Helper.hlslOP; // s = saturate((x-min)/(max-min)). 
IRBuilder<> Builder(CI); - Value *minVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMinIdx); - Value *maxVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMaxIdx); - Value *maxSubMin = Builder.CreateFSub(maxVal, minVal); - Value *x = CI->getArgOperand(HLOperandIndex::kSmoothStepOpXIdx); - Value *xSubMin = Builder.CreateFSub(x, minVal); - Value *satVal = Builder.CreateFDiv(xSubMin, maxSubMin); - - Value *s = - TrivialDxilUnaryOperation(DXIL::OpCode::Saturate, satVal, hlslOP, Builder, - helper.M.GetShaderModel()->IsSM69Plus()); + Value *MinVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMinIdx); + Value *MaxVal = CI->getArgOperand(HLOperandIndex::kSmoothStepOpMaxIdx); + Value *MaxSubMin = Builder.CreateFSub(MaxVal, MinVal); + Value *X = CI->getArgOperand(HLOperandIndex::kSmoothStepOpXIdx); + Value *XSubMin = Builder.CreateFSub(X, MinVal); + Value *SatVal = Builder.CreateFDiv(XSubMin, MaxSubMin); + + Value *S = + TrivialDxilUnaryOperation(DXIL::OpCode::Saturate, SatVal, OP, Builder, + Helper.M.GetShaderModel()->IsSM69Plus()); // return s * s *(3-2*s). 
- Constant *c2 = ConstantFP::get(CI->getType(), 2); - Constant *c3 = ConstantFP::get(CI->getType(), 3); + Constant *C2 = ConstantFP::get(CI->getType(), 2); + Constant *C3 = ConstantFP::get(CI->getType(), 3); - Value *sMul2 = Builder.CreateFMul(s, c2); - Value *result = Builder.CreateFSub(c3, sMul2); - result = Builder.CreateFMul(s, result); - result = Builder.CreateFMul(s, result); - return result; + Value *SMul2 = Builder.CreateFMul(S, C2); + Value *Result = Builder.CreateFSub(C3, SMul2); + Result = Builder.CreateFMul(S, Result); + Result = Builder.CreateFMul(S, Result); + return Result; } Value *TranslateMSad4(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -3099,47 +3095,47 @@ Value *SplatToVector(Value *Elt, Type *DstTy, IRBuilder<> &Builder) { return Result; } -Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, - HLOperationLowerHelper &helper, - HLObjectOperationLowerHelper *pObjHelper, +Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - hlsl::OP *hlslOP = &helper.hlslOP; - Value *arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); - Value *arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); - Type *arg0Ty = arg0->getType(); - Type *arg1Ty = arg1->getType(); + hlsl::OP *OP = &Helper.hlslOP; + Value *Arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); + Value *Arg1 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc1Idx); + Type *Arg0Ty = Arg0->getType(); + Type *Arg1Ty = Arg1->getType(); IRBuilder<> Builder(CI); - if (arg0Ty->isVectorTy()) { - if (arg1Ty->isVectorTy()) { + if (Arg0Ty->isVectorTy()) { + if (Arg1Ty->isVectorTy()) { // mul(vector, vector) == dot(vector, vector) - unsigned vecSize = arg0Ty->getVectorNumElements(); - if (arg0Ty->getScalarType()->isFloatingPointTy()) { - return TranslateFDot(arg0, arg1, vecSize, hlslOP, Builder); + unsigned VecSize = Arg0Ty->getVectorNumElements(); + if 
(Arg0Ty->getScalarType()->isFloatingPointTy()) { + return TranslateFDot(Arg0, Arg1, VecSize, OP, Builder); } else { DXIL::OpCode MadOpCode = DXIL::OpCode::IMad; if (IOP == IntrinsicOp::IOP_umul) MadOpCode = DXIL::OpCode::UMad; - return ExpandDot(arg0, arg1, vecSize, hlslOP, Builder, MadOpCode); + return ExpandDot(Arg0, Arg1, VecSize, OP, Builder, MadOpCode); } } else { // mul(vector, scalar) == vector * scalar-splat - arg1 = SplatToVector(arg1, arg0Ty, Builder); + Arg1 = SplatToVector(Arg1, Arg0Ty, Builder); } } else { - if (arg1Ty->isVectorTy()) { + if (Arg1Ty->isVectorTy()) { // mul(scalar, vector) == scalar-splat * vector - arg0 = SplatToVector(arg0, arg1Ty, Builder); + Arg0 = SplatToVector(Arg0, Arg1Ty, Builder); } // else mul(scalar, scalar) == scalar * scalar; } // create fmul/mul for the pair of vectors or scalars - if (arg0Ty->getScalarType()->isFloatingPointTy()) { - return Builder.CreateFMul(arg0, arg1); + if (Arg0Ty->getScalarType()->isFloatingPointTy()) { + return Builder.CreateFMul(Arg0, Arg1); } else { - return Builder.CreateMul(arg0, arg1); + return Builder.CreateMul(Arg0, Arg1); } } From 3f7b1086662f1335696f2f891a4e1deaa79eb09a Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Mon, 7 Apr 2025 13:57:59 -0600 Subject: [PATCH 4/6] clang-format --- lib/HLSL/HLOperationLower.cpp | 41 +++++++++++++++-------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index 6292e66120..ed6bdc3f6d 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -474,8 +474,8 @@ Value *TrivialDxilOperation(Function *dxilFunc, OP::OpCode opcode, // Utility objects `HlslOp` and `Builder` are used to create a call to the given // `DxilFunc` with `RefArgs` arguments. 
Value *TrivialDxilVectorOperation(Function *Func, OP::OpCode Opcode, - ArrayRef Args, Type *Ty, - OP *OP, IRBuilder<> &Builder) { + ArrayRef Args, Type *Ty, OP *OP, + IRBuilder<> &Builder) { if (!Ty->isVoidTy()) return Builder.CreateCall(Func, Args, OP->GetOpCodeName(Opcode)); else @@ -487,25 +487,22 @@ Value *TrivialDxilVectorOperation(Function *Func, OP::OpCode Opcode, // for each vector element and reconstruct the vector type from those results or // operate on and return native vectors depending on vector size and the value // of `SupportsVectors`, which is deteremined by version and opcode support. -Value *TrivialDxilOperation(OP::OpCode Opcode, ArrayRef Args, - Type *Ty, Type *RetTy, OP *OP, - IRBuilder<> &Builder, +Value *TrivialDxilOperation(OP::OpCode Opcode, ArrayRef Args, Type *Ty, + Type *RetTy, OP *OP, IRBuilder<> &Builder, bool SupportsVectors = false) { // If supported and the overload type is a vector with more than 1 element, // create a native vector operation. if (SupportsVectors && Ty->isVectorTy() && Ty->getVectorNumElements() > 1) { Function *Func = OP->GetOpFunc(Opcode, Ty); - return TrivialDxilVectorOperation(Func, Opcode, Args, Ty, OP, - Builder); + return TrivialDxilVectorOperation(Func, Opcode, Args, Ty, OP, Builder); } // Set overload type to the scalar type of `Ty` and generate call(s). 
Type *EltTy = Ty->getScalarType(); Function *Func = OP->GetOpFunc(Opcode, EltTy); - return TrivialDxilOperation(Func, Opcode, Args, Ty, RetTy, OP, - Builder); + return TrivialDxilOperation(Func, Opcode, Args, Ty, RetTy, OP, Builder); } Value *TrivialDxilOperation(OP::OpCode opcode, ArrayRef refArgs, @@ -540,8 +537,8 @@ Value *TrivialUnaryOperationRet(CallInst *CI, IntrinsicOp IOP, return TrivialDxilOperation(Opcode, Args, Ty, RetTy, OP, Builder); } -Value *TrivialDxilUnaryOperation(OP::OpCode Opcode, Value *Src, - hlsl::OP *OP, IRBuilder<> &Builder, +Value *TrivialDxilUnaryOperation(OP::OpCode Opcode, Value *Src, hlsl::OP *OP, + IRBuilder<> &Builder, bool SupportsVectors = false) { Type *Ty = Src->getType(); @@ -2124,7 +2121,8 @@ Value *TranslateFirstbitLo(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, HLOperationLowerHelper &Helper, HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { - return TrivialUnaryOperationRet(CI, IOP, Opcode, Helper, ObjHelper, Translated); + return TrivialUnaryOperationRet(CI, IOP, Opcode, Helper, ObjHelper, + Translated); } Value *TranslateLit(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, @@ -2273,8 +2271,7 @@ Value *TranslateDistance(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, HLOperationLowerHelper &Helper, - HLObjectOperationLowerHelper *ObjHelper, - bool &Translated) { + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); @@ -2291,8 +2288,7 @@ Value *TranslateExp(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, Value *TranslateLog(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, HLOperationLowerHelper &Helper, - HLObjectOperationLowerHelper *ObjHelper, - bool &Translated) { + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { hlsl::OP *OP = &Helper.hlslOP; IRBuilder<> Builder(CI); Type *Ty = CI->getType(); @@ -2316,7 +2312,8 @@ Value 
*TranslateLog10(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, IRBuilder<> Builder(CI); Type *Ty = CI->getType(); Value *Val = CI->getArgOperand(HLOperandIndex::kUnaryOpSrc0Idx); - Constant *Log2to10Const = ConstantFP::get(Ty->getScalarType(), M_LN2 / M_LN10); + Constant *Log2to10Const = + ConstantFP::get(Ty->getScalarType(), M_LN2 / M_LN10); if (Ty != Ty->getScalarType()) { Log2to10Const = ConstantVector::getSplat(Ty->getVectorNumElements(), Log2to10Const); @@ -2517,8 +2514,8 @@ Value *ExpandDot(Value *Arg0, Value *Arg1, unsigned VecSize, hlsl::OP *OP, for (unsigned Elt = 1; Elt < VecSize; ++Elt) { Elt0 = Builder.CreateExtractElement(Arg0, Elt); Elt1 = Builder.CreateExtractElement(Arg1, Elt); - Result = TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, OP, - Builder); + Result = + TrivialDxilTrinaryOperation(MadOpCode, Elt0, Elt1, Result, OP, Builder); } return Result; @@ -2548,8 +2545,7 @@ Value *TranslateFDot(Value *arg0, Value *arg1, unsigned vecSize, Value *TranslateDot(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, HLOperationLowerHelper &Helper, - HLObjectOperationLowerHelper *ObjHelper, - bool &Translated) { + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { hlsl::OP *OP = &Helper.hlslOP; Value *Arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); Type *Ty = Arg0->getType(); @@ -3097,8 +3093,7 @@ Value *SplatToVector(Value *Elt, Type *DstTy, IRBuilder<> &Builder) { Value *TranslateMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode Opcode, HLOperationLowerHelper &Helper, - HLObjectOperationLowerHelper *ObjHelper, - bool &Translated) { + HLObjectOperationLowerHelper *ObjHelper, bool &Translated) { hlsl::OP *OP = &Helper.hlslOP; Value *Arg0 = CI->getArgOperand(HLOperandIndex::kBinaryOpSrc0Idx); From 907cdba8c14a5c70f41a4aed90360c8afb5ac3bd Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Mon, 7 Apr 2025 15:03:06 -0600 Subject: [PATCH 5/6] Fix wrong mul type and tightened up dot() testing Was using int dot for the float operands as it 
was originally an int-only lowering function. --- lib/HLSL/HLOperationLower.cpp | 6 +++++- .../types/longvec-scalarized-intrinsics.hlsl | 17 +++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index ed6bdc3f6d..a2bab818a8 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -2510,7 +2510,11 @@ Value *ExpandDot(Value *Arg0, Value *Arg1, unsigned VecSize, hlsl::OP *OP, DXIL::OpCode MadOpCode = DXIL::OpCode::IMad) { Value *Elt0 = Builder.CreateExtractElement(Arg0, (uint64_t)0); Value *Elt1 = Builder.CreateExtractElement(Arg1, (uint64_t)0); - Value *Result = Builder.CreateMul(Elt0, Elt1); + Value *Result; + if (Elt0->getType()->isFloatingPointTy()) + Result = Builder.CreateFMul(Elt0, Elt1); + else + Result = Builder.CreateMul(Elt0, Elt1); for (unsigned Elt = 1; Elt < VecSize; ++Elt) { Elt0 = Builder.CreateExtractElement(Arg0, Elt); Elt1 = Builder.CreateExtractElement(Arg1, Elt); diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl index 4886f04e01..7d5da99e21 100644 --- a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-scalarized-intrinsics.hlsl @@ -104,14 +104,15 @@ float4 main(uint i : SV_PrimitiveID, bool b : B) : SV_Target { // CHECK: fsub fast <8 x float> vec1 = modf(vec1, vec2); - // CHECK: fmul fast float - // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) - // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) - // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) - // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) - // CHECK: call 
float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) - // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) - // CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c) + // CHECK: [[el:%.*]] = extractelement <8 x float> + // CHECK: [[mul:%.*]] = fmul fast float [[el]] + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mul]]) ; FMad(a,b,c) + // CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c) + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c) + // CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c) + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c) + // CHECK: [[pong:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[ping]]) ; FMad(a,b,c) + // CHECK: [[ping:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[pong]]) ; FMad(a,b,c) vec1 = dot(vec1, vec2); vector bvec = b; From dcc76b44daa2f7c2adf0d0e21c1bc6c21f293a9f Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Tue, 8 Apr 2025 08:05:57 -0600 Subject: [PATCH 6/6] Add IR test for dxilgen pass --- .../passes/longvec-intrinsics.hlsl | 186 ++++++++ .../CodeGenDXIL/passes/longvec-intrinsics.ll | 434 ++++++++++++++++++ 2 files changed, 620 insertions(+) create mode 100644 tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl create mode 100644 tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl new file mode 100644 index 
0000000000..11d705305d --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.hlsl @@ -0,0 +1,186 @@ +// RUN: %dxc -T cs_6_9 -enable-16bit-types -DNUM=13 %s | FileCheck %s + +// Source for dxilgen test CodeGenDXIL/passes/longvec-intrinsics.ll. +// Some targetted filecheck testing as an incidental. + +RWStructuredBuffer > hBuf; +RWStructuredBuffer > fBuf; +RWStructuredBuffer > dBuf; + +RWStructuredBuffer > bBuf; +RWStructuredBuffer > uBuf; +RWStructuredBuffer > lBuf; + +[numthreads(8,1,1)] +void main() { + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 11, i32 0, i32 4) + // CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 12, i32 0, i32 4) + // CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f32 @dx.op.rawBufferVectorLoad.v13f32(i32 303, %dx.types.Handle {{%.*}}, i32 13, i32 0, i32 4) + // CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.v13f32 [[ld]], 0 + vector fVec1 = fBuf[11]; + vector fVec2 = fBuf[12]; + vector fVec3 = fBuf[13]; + + // CHECK: [[tmp:%.*]] = call <13 x float> @dx.op.binary.v13f32(i32 35, <13 x float> [[fvec1]], <13 x float> [[fvec2]]) ; FMax(a,b) + // CHECK: call <13 x float> @dx.op.binary.v13f32(i32 36, <13 x float> [[tmp]], <13 x float> [[fvec3]]) ; FMin(a,b) + vector fRes = clamp(fVec1, fVec2, fVec3); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 14, i32 0, i32 2) + // CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 15, i32 0, i32 2) + // CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.v13f16 
[[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f16 @dx.op.rawBufferVectorLoad.v13f16(i32 303, %dx.types.Handle {{%.*}}, i32 16, i32 0, i32 2) + // CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.v13f16 [[ld]], 0 + vector hVec1 = hBuf[14]; + vector hVec2 = hBuf[15]; + vector hVec3 = hBuf[16]; + + // CHECK: [[tmp:%.*]] = fcmp fast olt <13 x half> [[hvec2]], [[hvec1]] + // CHECK: select <13 x i1> [[tmp]], <13 x half> zeroinitializer, <13 x half> hRes = step(hVec1, hVec2); + + // CHECK: [[tmp:%.*]] = fmul fast <13 x float> [[fvec1]], @dx.op.unary.v13f32(i32 21, <13 x float> [[tmp]]) ; Exp(value) + fRes += exp(fVec1); + + // CHECK: [[tmp:%.*]] = call <13 x half> @dx.op.unary.v13f16(i32 23, <13 x half> [[hvec1]]) ; Log(value) + // CHECK: fmul fast <13 x half> [[tmp]], [[fvec2]], [[fvec1]] + // CHECK: [[xsub:%.*]] = fsub fast <13 x float> [[fvec3]], [[fvec1]] + // CHECK: [[div:%.*]] = fdiv fast <13 x float> [[xsub]], [[sub]] + // CHECK: [[sat:%.*]] = call <13 x float> @dx.op.unary.v13f32(i32 7, <13 x float> [[div]]) ; Saturate(value) + // CHECK: [[mul:%.*]] = fmul fast <13 x float> [[sat]], , [[mul]] + // CHECK: [[mul:%.*]] = fmul fast <13 x float> [[sat]], [[sat]] + // CHECK: fmul fast <13 x float> [[mul]], [[sub]] + fRes += smoothstep(fVec1, fVec2, fVec3); + + // Intrinsics that expand into llvm ops. 
+ + // CHECK: fmul fast <13 x float> [[fvec3]], [[fvec1]], zeroinitializer + // CHECK: [[f2i:%.*]] = bitcast <13 x float> [[fvec1]] to <13 x i32> + // CHECK: [[and:%.*]] = and <13 x i32> [[f2i]], [[and]], [[add]], [[shr]] to <13 x float> + // CHECK: [[sel:%.*]] = select <13 x i1> [[cmp]], <13 x float> [[i2f]], <13 x float> zeroinitializer + // CHECK: [[and:%.*]] = and <13 x i32> [[f2i]], [[and]], exp = fVec3; + fRes += frexp(fVec1, exp); + fRes += exp; + + // CHECK: [[tmp:%.*]] = fsub fast <13 x half> [[hvec3]], [[hvec2]] + // CHECK: fmul fast <13 x half> [[tmp]], [[hvec1]] + hRes += lerp(hVec2, hVec3, hVec1); + + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4) + // CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 18, i32 0, i32 4) + // CHECK: [[uvec2:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + vector uVec1 = uBuf[17]; + vector uVec2 = uBuf[18]; + + vector signs = 1; + // CHECK: [[cmp:%.*]] = icmp ne <13 x i32> [[uvec2]], zeroinitializer + // CHECK: zext <13 x i1> [[cmp]] to <13 x i32> + signs *= sign(uVec2); + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i64 @dx.op.rawBufferVectorLoad.v13i64(i32 303, %dx.types.Handle {{%.*}}, i32 19, i32 0, i32 8) + // CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.v13i64 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i64 @dx.op.rawBufferVectorLoad.v13i64(i32 303, %dx.types.Handle {{%.*}}, i32 20, i32 0, i32 8) + // CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.v13i64 [[ld]], 0 + vector lVec1 = lBuf[19]; + vector lVec2 = lBuf[20]; + + // CHECK: [[gt:%.*]] = icmp sgt <13 x i64> [[lvec2]], zeroinitializer + // CHECK: [[lt:%.*]] = icmp slt <13 x i64> [[lvec2]], zeroinitializer + // CHECK: [[igt:%.*]] = zext <13 x i1> [[gt]] to <13 x i32> + // CHECK: 
[[ilt:%.*]] = zext <13 x i1> [[lt]] to <13 x i32> + // CHECK: sub nsw <13 x i32> [[igt]], [[ilt]] + signs *= sign(lVec2); + + vector uRes = signs; + + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 21, i32 0, i32 4) + // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer + // CHECK: [[vec1:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32> + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 22, i32 0, i32 4) + // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer + // CHECK: [[vec2:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32> + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13i32 @dx.op.rawBufferVectorLoad.v13i32(i32 303, %dx.types.Handle {{%.*}}, i32 23, i32 0, i32 4) + // CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v13i32 [[ld]], 0 + // CHECK: [[bvec:%.*]] = icmp ne <13 x i32> [[vec]], zeroinitializer + // CHECK: [[vec3:%.*]] = zext <13 x i1> [[bvec]] to <13 x i32> + vector bVec1 = bBuf[21]; + vector bVec2 = bBuf[22]; + vector bVec3 = bBuf[23]; + + // CHECK: [[bvec2:%.*]] = icmp ne <13 x i32> [[vec2]], zeroinitializer + // CHECK: [[bvec1:%.*]] = icmp ne <13 x i32> [[vec1]], zeroinitializer + // CHECK: or <13 x i1> [[bvec2]], [[bvec1]] + uRes += or(bVec1, bVec2); + + // CHECK: [[bvec3:%.*]] = icmp ne <13 x i32> [[vec3]], zeroinitializer + // CHECK: and <13 x i1> [[bvec3]], [[bvec2]] + uRes += and(bVec2, bVec3); + + // CHECK: select <13 x i1> [[bvec3]], <13 x i64> [[lvec1]], <13 x i64> [[lvec2]] + vector lRes = select(bVec3, lVec1, lVec2); + + // CHECK: [[el1:%.*]] = extractelement <13 x float> [[fvec1]] + // CHECK: [[el2:%.*]] = extractelement <13 x float> [[fvec2]] + // CHECK: [[mul:%.*]] = fmul fast float [[el2]], [[el1]] + // CHECK: 
[[mad1:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mul]]) ; FMad(a,b,c) + // CHECK: [[mad2:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad1]]) ; FMad(a,b,c) + // CHECK: [[mad3:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad2]]) ; FMad(a,b,c) + // CHECK: [[mad4:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad3]]) ; FMad(a,b,c) + // CHECK: [[mad5:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad4]]) ; FMad(a,b,c) + // CHECK: [[mad6:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad5]]) ; FMad(a,b,c) + // CHECK: [[mad7:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad6]]) ; FMad(a,b,c) + // CHECK: [[mad8:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad7]]) ; FMad(a,b,c) + // CHECK: [[mad9:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad8]]) ; FMad(a,b,c) + // CHECK: [[mad10:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad9]]) ; FMad(a,b,c) + // CHECK: [[mad11:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad10]]) ; FMad(a,b,c) + // CHECK: [[mad12:%.*]] = call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float [[mad11]]) ; FMad(a,b,c) + fRes += dot(fVec1, fVec2); + + // CHECK: call <13 x float> @dx.op.unary.v13f32(i32 17, <13 x float> [[fvec1]]) ; Atan(value) + fRes += atan(fVec1); + + // CHECK: call <13 x i32> @dx.op.binary.v13i32(i32 40, <13 x i32> [[uvec1]], <13 x i32> [[uvec2]]) ; UMin(a,b) + uRes += min(uVec1, uVec2); + + // CHECK: call <13 x float> @dx.op.tertiary.v13f32(i32 46, <13 x float> [[fvec1]], <13 x float> [[fvec2]], <13 x float> [[fvec3]]) ; FMad(a,b,c) + fRes += mad(fVec1, fVec2, fVec3); + + // CHECK: 
[[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) + // CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) + // CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0 + // CHECK: [[ld:%.*]] = call %dx.types.ResRet.v13f64 @dx.op.rawBufferVectorLoad.v13f64(i32 303, %dx.types.Handle {{%.*}}, i32 26, i32 0, i32 8) + // CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.v13f64 [[ld]], 0 + vector dVec1 = dBuf[24]; + vector dVec2 = dBuf[25]; + vector dVec3 = dBuf[26]; + + // CHECK: call <13 x double> @dx.op.tertiary.v13f64(i32 47, <13 x double> [[dvec1]], <13 x double> [[dvec2]], <13 x double> [[dvec3]]) + vector dRes = fma(dVec1, dVec2, dVec3); + + hBuf[0] = hRes; + fBuf[0] = fRes; + dBuf[0] = dRes; + uBuf[0] = uRes; + lBuf[0] = lRes; +} diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll new file mode 100644 index 0000000000..8f9dcbbdbc --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/passes/longvec-intrinsics.ll @@ -0,0 +1,434 @@ +; RUN: %dxopt %s -dxilgen -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i1:32-i8:8-i16:16-i32:32-i64:64-f16:16-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%"class.RWStructuredBuffer >" = type { <7 x half> } +%"class.RWStructuredBuffer >" = type { <7 x float> } +%"class.RWStructuredBuffer >" = type { <7 x double> } +%"class.RWStructuredBuffer >" = type { <7 x i32> } +%"class.RWStructuredBuffer >" = type { <7 x i32> } +%"class.RWStructuredBuffer >" = type { <7 x i64> } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" = external global %"class.RWStructuredBuffer >", align 2 
+@"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" = external global %"class.RWStructuredBuffer >", align 4 +@"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" = external global %"class.RWStructuredBuffer >", align 8 +@"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" = external global %"class.RWStructuredBuffer >", align 4 +@"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" = external global %"class.RWStructuredBuffer >", align 4 +@"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" = external global %"class.RWStructuredBuffer >", align 8 + +; CHECK-LABEL: define void @main() +define void @main() #0 { +bb: + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 11, i32 0, i32 4) + ; CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 12, i32 0, i32 4) + ; CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f32 @dx.op.rawBufferVectorLoad.v7f32(i32 303, %dx.types.Handle {{%.*}}, i32 13, i32 0, i32 4) + ; CHECK: [[fvec3:%.*]] = extractvalue %dx.types.ResRet.v7f32 [[ld]], 0 + + %exp = alloca <7 x float>, align 4 + %tmp = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:23 col:30 + %tmp1 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp) ; line:23 col:30 + %tmp2 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp1, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:23 col:30 + %tmp3 = call <7 x float>* 
@"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp2, i32 11) ; line:23 col:30 + %tmp4 = load <7 x float>, <7 x float>* %tmp3 ; line:23 col:30 + %tmp5 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:24 col:30 + %tmp6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp5) ; line:24 col:30 + %tmp7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:24 col:30 + %tmp8 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp7, i32 12) ; line:24 col:30 + %tmp9 = load <7 x float>, <7 x float>* %tmp8 ; line:24 col:30 + %tmp10 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:25 col:30 + %tmp11 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp10) ; line:25 col:30 + %tmp12 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp11, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:25 col:30 + %tmp13 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp12, i32 13) ; line:25 col:30 + %tmp14 = load <7 x float>, <7 x float>* %tmp13 ; line:25 col:30 + + ; Clamp operation. 
+ ; CHECK: [[max:%.*]] = call <7 x float> @dx.op.binary.v7f32(i32 35, <7 x float> [[fvec1]], <7 x float> [[fvec2]]) + ; CHECK: call <7 x float> @dx.op.binary.v7f32(i32 36, <7 x float> [[max]], <7 x float> [[fvec3]]) + %tmp15 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 119, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:29 col:29 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 14, i32 0, i32 2) + ; CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 15, i32 0, i32 2) + ; CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f16 @dx.op.rawBufferVectorLoad.v7f16(i32 303, %dx.types.Handle {{%.*}}, i32 16, i32 0, i32 2) + ; CHECK: [[hvec3:%.*]] = extractvalue %dx.types.ResRet.v7f16 [[ld]], 0 + %tmp16 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:37 col:34 + %tmp17 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp16) ; line:37 col:34 + %tmp18 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp17, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:37 col:34 + %tmp19 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp18, i32 14) ; line:37 col:34 + %tmp20 = load <7 x half>, <7 x half>* %tmp19 ; line:37 col:34 + %tmp21 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* 
@"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:38 col:34 + %tmp22 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp21) ; line:38 col:34 + %tmp23 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp22, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:38 col:34 + %tmp24 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp23, i32 15) ; line:38 col:34 + %tmp25 = load <7 x half>, <7 x half>* %tmp24 ; line:38 col:34 + %tmp26 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:39 col:34 + %tmp27 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp26) ; line:39 col:34 + %tmp28 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp27, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:39 col:34 + %tmp29 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp28, i32 16) ; line:39 col:34 + %tmp30 = load <7 x half>, <7 x half>* %tmp29 ; line:39 col:34 + + ; Step operation. + ; CHECK: [[cmp:%.*]] = fcmp fast olt <7 x half> [[hvec2]], [[hvec1]] + ; CHECK: select <7 x i1> [[cmp]], <7 x half> zeroinitializer, <7 x half> + %tmp31 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>)"(i32 192, <7 x half> %tmp20, <7 x half> %tmp25) ; line:43 col:33 + + ; Exp operation. 
+ ; CHECK: [[mul:%.*]] = fmul fast <7 x float> , [[fvec1]] + ; CHECK call <7 x float> @dx.op.unary.v7f32(i32 21, <7 x float> [[mul]]) + %tmp32 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 139, <7 x float> %tmp4) ; line:47 col:11 + %tmp33 = fadd <7 x float> %tmp15, %tmp32 ; line:47 col:8 + + ; Log operation. + ; CHECK: [[log:%.*]] = call <7 x half> @dx.op.unary.v7f16(i32 23, <7 x half> [[hvec1]]) + ; CHECK: fmul fast <7 x half> , [[log]] + %tmp34 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>)"(i32 159, <7 x half> %tmp20) ; line:51 col:11 + %tmp35 = fadd <7 x half> %tmp31, %tmp34 ; line:51 col:8 + + ; Smoothstep operation. + ; CHECK: [[sub1:%.*]] = fsub fast <7 x float> [[fvec2]], [[fvec1]] + ; CHECK: [[sub2:%.*]] = fsub fast <7 x float> [[fvec3]], [[fvec1]] + ; CHECK: [[div:%.*]] = fdiv fast <7 x float> [[sub2]], [[sub1]] + ; CHECK: [[sat:%.*]] = call <7 x float> @dx.op.unary.v7f32(i32 7, <7 x float> [[div]]) + ; CHECK: [[mul:%.*]] = fmul fast <7 x float> [[sat]], + ; CHECK: [[sub:%.*]] = fsub fast <7 x float> , [[mul]] + ; CHECK: [[mul:%.*]] = fmul fast <7 x float> [[sat]], [[sub]] + ; CHECK: fmul fast <7 x float> %Saturate, [[mul]] + %tmp36 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 189, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:61 col:11 + %tmp37 = fadd <7 x float> %tmp33, %tmp36 ; line:61 col:8 + + ; Radians operation. + ; CHECK: fmul fast <7 x float> , [[fvec3]] + %tmp38 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 176, <7 x float> %tmp14) ; line:66 col:11 + %tmp39 = fadd <7 x float> %tmp37, %tmp38 ; line:66 col:8 + store <7 x float> %tmp14, <7 x float>* %exp, align 4 ; line:77 col:22 + + ; Frexp operation. 
+ ; CHECK: [[cmp:%.*]] = fcmp fast une <7 x float> [[fvec1]], zeroinitializer + ; CHECK: [[ext:%.*]] = sext <7 x i1> [[cmp]] to <7 x i32> + ; CHECK: [[bct:%.*]] = bitcast <7 x float> [[fvec1]] to <7 x i32> + ; CHECK: [[and:%.*]] = and <7 x i32> [[bct]], + ; CHECK: [[add:%.*]] = add <7 x i32> [[and]], + ; CHECK: [[and:%.*]] = and <7 x i32> [[add]], [[ext]] + ; CHECK: [[shr:%.*]] = ashr <7 x i32> [[and]], + ; CHECK: [[i2f:%.*]] = sitofp <7 x i32> [[shr]] to <7 x float> + ; CHECK: store <7 x float> [[i2f]], <7 x float>* %exp + ; CHECK: [[and:%.*]] = and <7 x i32> [[bct]], + ; CHECK: [[or:%.*]] = or <7 x i32> [[and]], + ; CHECK: [[and:%.*]] = and <7 x i32> [[or]], [[ext]] + ; CHECK: bitcast <7 x i32> [[and]] to <7 x float> + %tmp41 = call <7 x float> @"dx.hl.op..<7 x float> (i32, <7 x float>, <7 x float>*)"(i32 150, <7 x float> %tmp4, <7 x float>* %exp) ; line:78 col:11 + %tmp42 = fadd <7 x float> %tmp39, %tmp41 ; line:78 col:8 + %tmp43 = load <7 x float>, <7 x float>* %exp, align 4 ; line:79 col:11 + %tmp44 = fadd <7 x float> %tmp42, %tmp43 ; line:79 col:8 + + ; Lerp operation. 
+ ; CHECK: [[sub:%.*]] = fsub fast <7 x half> [[hvec3]], [[hvec2]] + ; CHECK: fmul fast <7 x half> [[hvec1]], [[sub]] + %tmp45 = call <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>, <7 x half>)"(i32 157, <7 x half> %tmp25, <7 x half> %tmp30, <7 x half> %tmp20) ; line:83 col:11 + %tmp46 = fadd <7 x half> %tmp35, %tmp45 ; line:83 col:8 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 17, i32 0, i32 4) + ; CHECK: [[uvec1:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 18, i32 0, i32 4) + ; CHECK: [[uvec2:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + %tmp47 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:90 col:29 + %tmp48 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp47) ; line:90 col:29 + %tmp49 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp48, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:90 col:29 + %tmp50 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp49, i32 17) ; line:90 col:29 + %tmp51 = load <7 x i32>, <7 x i32>* %tmp50 ; line:90 col:29 + %tmp52 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:91 col:29 + %tmp53 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp52) ; line:91 col:29 + %tmp54 = call %dx.types.Handle 
@"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp53, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:91 col:29 + %tmp55 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp54, i32 18) ; line:91 col:29 + %tmp56 = load <7 x i32>, <7 x i32>* %tmp55 ; line:91 col:29 + + ; Unsigned int sign operation. + ; CHECK: [[cmp:%.*]] = icmp ne <7 x i32> [[uvec2]], zeroinitializer + ; CHECK: zext <7 x i1> [[cmp]] to <7 x i32> + %tmp57 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>)"(i32 355, <7 x i32> %tmp56) ; line:96 col:12 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i64 @dx.op.rawBufferVectorLoad.v7i64(i32 303, %dx.types.Handle {{%.*}}, i32 19, i32 0, i32 8) + ; CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.v7i64 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i64 @dx.op.rawBufferVectorLoad.v7i64(i32 303, %dx.types.Handle {{%.*}}, i32 20, i32 0, i32 8) + ; CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.v7i64 [[ld]], 0 + %tmp58 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:102 col:32 + %tmp59 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp58) ; line:102 col:32 + %tmp60 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp59, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:102 col:32 + %tmp61 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp60, i32 19) ; line:102 col:32 + %tmp62 = load <7 x 
i64>, <7 x i64>* %tmp61 ; line:102 col:32 + %tmp63 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:103 col:32 + %tmp64 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp63) ; line:103 col:32 + %tmp65 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp64, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:103 col:32 + %tmp66 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp65, i32 20) ; line:103 col:32 + %tmp67 = load <7 x i64>, <7 x i64>* %tmp66 ; line:103 col:32 + + ; Signed int sign operation. + ; CHECK: [[lt1:%.*]] = icmp slt <7 x i64> zeroinitializer, [[lvec2]] + ; CHECK: [[lt2:%.*]] = icmp slt <7 x i64> [[lvec2]], zeroinitializer + ; CHECK: [[ilt1:%.*]] = zext <7 x i1> [[lt1]] to <7 x i32> + ; CHECK: [[ilt2:%.*]] = zext <7 x i1> [[lt2]] to <7 x i32> + ; CHECK: sub <7 x i32> [[ilt1]], [[ilt2]] + %tmp68 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i64>)"(i32 185, <7 x i64> %tmp67) ; line:110 col:12 + %tmp69 = mul <7 x i32> %tmp57, %tmp68 ; line:110 col:9 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 21, i32 0, i32 4) + ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer + ; CHECK: [[vec1:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32> + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 22, i32 0, i32 4) + ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[bvec:%.*]] = icmp 
ne <7 x i32> [[vec]], zeroinitializer + ; CHECK: [[vec2:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32> + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7i32 @dx.op.rawBufferVectorLoad.v7i32(i32 303, %dx.types.Handle {{%.*}}, i32 23, i32 0, i32 4) + ; CHECK: [[vec:%.*]] = extractvalue %dx.types.ResRet.v7i32 [[ld]], 0 + ; CHECK: [[bvec:%.*]] = icmp ne <7 x i32> [[vec]], zeroinitializer + ; CHECK: [[vec3:%.*]] = zext <7 x i1> [[bvec]] to <7 x i32> + %tmp70 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:126 col:29 + %tmp71 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp70) ; line:126 col:29 + %tmp72 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp71, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:126 col:29 + %tmp73 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp72, i32 21) ; line:126 col:29 + %tmp74 = load <7 x i32>, <7 x i32>* %tmp73 ; line:126 col:29 + %tmp75 = icmp ne <7 x i32> %tmp74, zeroinitializer ; line:126 col:29 + %tmp76 = zext <7 x i1> %tmp75 to <7 x i32> ; line:126 col:21 + %tmp77 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:127 col:29 + %tmp78 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp77) ; line:127 col:29 + %tmp79 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp78, %dx.types.ResourceProperties { i32 
4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:127 col:29 + %tmp80 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp79, i32 22) ; line:127 col:29 + %tmp81 = load <7 x i32>, <7 x i32>* %tmp80 ; line:127 col:29 + %tmp82 = icmp ne <7 x i32> %tmp81, zeroinitializer ; line:127 col:29 + %tmp83 = zext <7 x i1> %tmp82 to <7 x i32> ; line:127 col:21 + %tmp84 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A" ; line:128 col:29 + %tmp85 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp84) ; line:128 col:29 + %tmp86 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp85, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:128 col:29 + %tmp87 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp86, i32 23) ; line:128 col:29 + %tmp88 = load <7 x i32>, <7 x i32>* %tmp87 ; line:128 col:29 + %tmp89 = icmp ne <7 x i32> %tmp88, zeroinitializer ; line:128 col:29 + %tmp90 = zext <7 x i1> %tmp89 to <7 x i32> ; line:128 col:21 + + + ; Or() operation. 
+ ; CHECK: [[bvec2:%.*]] = icmp ne <7 x i32> [[vec2]], zeroinitializer + ; CHECK: [[bvec1:%.*]] = icmp ne <7 x i32> [[vec1]], zeroinitializer + ; CHECK: or <7 x i1> [[bvec1]], [[bvec2]] + %tmp91 = icmp ne <7 x i32> %tmp83, zeroinitializer ; line:133 col:21 + %tmp92 = icmp ne <7 x i32> %tmp76, zeroinitializer ; line:133 col:14 + %tmp93 = call <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32 169, <7 x i1> %tmp92, <7 x i1> %tmp91) ; line:133 col:11 + %tmp94 = zext <7 x i1> %tmp93 to <7 x i32> ; line:133 col:11 + %tmp95 = add <7 x i32> %tmp69, %tmp94 ; line:133 col:8 + + ; And() operation. + ; CHECK: [[bvec3:%.*]] = icmp ne <7 x i32> [[vec3]], zeroinitializer + ; CHECK: [[bvec2:%.*]] = icmp ne <7 x i32> [[vec2]], zeroinitializer + ; CHECK: and <7 x i1> [[bvec2]], [[bvec3]] + %tmp96 = icmp ne <7 x i32> %tmp90, zeroinitializer ; line:137 col:22 + %tmp97 = icmp ne <7 x i32> %tmp83, zeroinitializer ; line:137 col:15 + %tmp98 = call <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32 106, <7 x i1> %tmp97, <7 x i1> %tmp96) ; line:137 col:11 + %tmp99 = zext <7 x i1> %tmp98 to <7 x i32> ; line:137 col:11 + %tmp100 = add <7 x i32> %tmp95, %tmp99 ; line:137 col:8 + + ; Select() operation. + ; CHECK: [[bvec3:%.*]] = icmp ne <7 x i32> [[vec3]], zeroinitializer + ; CHECK: select <7 x i1> [[bvec3]], <7 x i64> [[lvec1]], <7 x i64> [[lvec2]] + %tmp101 = icmp ne <7 x i32> %tmp90, zeroinitializer ; line:140 col:38 + %tmp102 = call <7 x i64> @"dx.hl.op.rn.<7 x i64> (i32, <7 x i1>, <7 x i64>, <7 x i64>)"(i32 184, <7 x i1> %tmp101, <7 x i64> %tmp62, <7 x i64> %tmp67) ; line:140 col:31 + %tmp103 = call float @"dx.hl.op.rn.float (i32, <7 x float>, <7 x float>)"(i32 134, <7 x float> %tmp4, <7 x float> %tmp9) ; line:152 col:11 + + ; Dot operation. 
+ ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 0 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 0 + ; CHECK: [[mul:%.*]] = fmul fast float [[el1]], [[el2]] + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 1 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 1 + ; CHECK: [[mad1:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mul]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 2 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 2 + ; CHECK: [[mad2:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad1]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 3 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 3 + ; CHECK: [[mad3:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad2]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 4 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 4 + ; CHECK: [[mad4:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad3]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 5 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 5 + ; CHECK: [[mad5:%.*]] = call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad4]]) + ; CHECK: [[el1:%.*]] = extractelement <7 x float> [[fvec1]], i64 6 + ; CHECK: [[el2:%.*]] = extractelement <7 x float> [[fvec2]], i64 6 + ; CHECK: call float @dx.op.tertiary.f32(i32 46, float [[el1]], float [[el2]], float [[mad5]]) + %tmp104 = insertelement <7 x float> undef, float %tmp103, i32 0 ; line:152 col:11 + %tmp105 = shufflevector <7 x float> %tmp104, <7 x float> undef, <7 x i32> zeroinitializer ; line:152 col:11 + %tmp106 = fadd <7 x float> %tmp44, %tmp105 ; line:152 col:8 + + ; Atan operation. 
+ ; CHECK: call <7 x float> @dx.op.unary.v7f32(i32 17, <7 x float> [[fvec1]]) + %tmp107 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32 116, <7 x float> %tmp4) ; line:155 col:11 + %tmp108 = fadd <7 x float> %tmp106, %tmp107 ; line:155 col:8 + + ; Min operation. + ; CHECK: call <7 x i32> @dx.op.binary.v7i32(i32 40, <7 x i32> [[uvec1]], <7 x i32> [[uvec2]]) + %tmp109 = call <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>, <7 x i32>)"(i32 353, <7 x i32> %tmp51, <7 x i32> %tmp56) ; line:158 col:11 + %tmp110 = add <7 x i32> %tmp100, %tmp109 ; line:158 col:8 + + ; Mad operation. + ; CHECK: call <7 x float> @dx.op.tertiary.v7f32(i32 46, <7 x float> [[fvec1]], <7 x float> [[fvec2]], <7 x float> [[fvec3]]) + %tmp111 = call <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32 162, <7 x float> %tmp4, <7 x float> %tmp9, <7 x float> %tmp14) ; line:161 col:11 + %tmp112 = fadd <7 x float> %tmp108, %tmp111 ; line:161 col:8 + + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 24, i32 0, i32 8) + ; CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 25, i32 0, i32 8) + ; CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0 + ; CHECK: [[ld:%.*]] = call %dx.types.ResRet.v7f64 @dx.op.rawBufferVectorLoad.v7f64(i32 303, %dx.types.Handle {{%.*}}, i32 26, i32 0, i32 8) + ; CHECK: [[dvec3:%.*]] = extractvalue %dx.types.ResRet.v7f64 [[ld]], 0 + %tmp113 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:169 col:31 + %tmp114 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp113) ; line:169 col:31 + %tmp115 = call %dx.types.Handle 
@"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp114, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:169 col:31 + %tmp116 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp115, i32 24) ; line:169 col:31 + %tmp117 = load <7 x double>, <7 x double>* %tmp116 ; line:169 col:31 + %tmp118 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:170 col:31 + %tmp119 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp118) ; line:170 col:31 + %tmp120 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp119, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:170 col:31 + %tmp121 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp120, i32 25) ; line:170 col:31 + %tmp122 = load <7 x double>, <7 x double>* %tmp121 ; line:170 col:31 + %tmp123 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:171 col:31 + %tmp124 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp123) ; line:171 col:31 + %tmp125 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp124, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" 
zeroinitializer) ; line:171 col:31 + %tmp126 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp125, i32 26) ; line:171 col:31 + %tmp127 = load <7 x double>, <7 x double>* %tmp126 ; line:171 col:31 + + ; FMA operation. + ; CHECK: call <7 x double> @dx.op.tertiary.v7f64(i32 47, <7 x double> [[dvec1]], <7 x double> [[dvec2]], <7 x double> [[dvec3]]) + %tmp128 = call <7 x double> @"dx.hl.op.rn.<7 x double> (i32, <7 x double>, <7 x double>, <7 x double>)"(i32 147, <7 x double> %tmp117, <7 x double> %tmp122, <7 x double> %tmp127) ; line:174 col:30 + %tmp129 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A" ; line:176 col:3 + %tmp130 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp129) ; line:176 col:3 + %tmp131 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp130, %dx.types.ResourceProperties { i32 4108, i32 14 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:176 col:3 + %tmp132 = call <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp131, i32 0) ; line:176 col:3 + store <7 x half> %tmp46, <7 x half>* %tmp132 ; line:176 col:11 + %tmp133 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A" ; line:177 col:3 + %tmp134 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp133) ; line:177 col:3 + %tmp135 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle 
%tmp134, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:177 col:3 + %tmp136 = call <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp135, i32 0) ; line:177 col:3 + store <7 x float> %tmp112, <7 x float>* %tmp136 ; line:177 col:11 + %tmp137 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A" ; line:178 col:3 + %tmp138 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp137) ; line:178 col:3 + %tmp139 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp138, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:178 col:3 + %tmp140 = call <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp139, i32 0) ; line:178 col:3 + store <7 x double> %tmp128, <7 x double>* %tmp140 ; line:178 col:11 + %tmp141 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A" ; line:179 col:3 + %tmp142 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp141) ; line:179 col:3 + %tmp143 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp142, %dx.types.ResourceProperties { i32 4108, i32 28 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:179 col:3 + %tmp144 = call <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp143, i32 0) ; 
line:179 col:3 + store <7 x i32> %tmp110, <7 x i32>* %tmp144 ; line:179 col:11 + %tmp145 = load %"class.RWStructuredBuffer >", %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A" ; line:180 col:3 + %tmp146 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32 0, %"class.RWStructuredBuffer >" %tmp145) ; line:180 col:3 + %tmp147 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32 14, %dx.types.Handle %tmp146, %dx.types.ResourceProperties { i32 4108, i32 56 }, %"class.RWStructuredBuffer >" zeroinitializer) ; line:180 col:3 + %tmp148 = call <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32 0, %dx.types.Handle %tmp147, i32 0) ; line:180 col:3 + store <7 x i64> %tmp102, <7 x i64>* %tmp148 ; line:180 col:11 + ret void ; line:181 col:1 +} + +declare <7 x float>* @"dx.hl.subscript.[].rn.<7 x float>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>, <7 x float>, <7 x float>)"(i32, <7 x float>, <7 x float>, <7 x float>) #1 +declare <7 x half>* @"dx.hl.subscript.[].rn.<7 x half>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, 
%dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>)"(i32, <7 x half>, <7 x half>) #1 +declare <7 x float> @"dx.hl.op.rn.<7 x float> (i32, <7 x float>)"(i32, <7 x float>) #1 +declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>)"(i32, <7 x half>) #1 +declare <7 x float> @"dx.hl.op..<7 x float> (i32, <7 x float>, <7 x float>*)"(i32, <7 x float>, <7 x float>*) #0 +declare <7 x half> @"dx.hl.op.rn.<7 x half> (i32, <7 x half>, <7 x half>, <7 x half>)"(i32, <7 x half>, <7 x half>, <7 x half>) #1 +declare <7 x i32>* @"dx.hl.subscript.[].rn.<7 x i32>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>)"(i32, <7 x i32>) #1 +declare <7 x i64>* @"dx.hl.subscript.[].rn.<7 x i64>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i64>)"(i32, <7 x i64>) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 
+declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x i1> @"dx.hl.op.rn.<7 x i1> (i32, <7 x i1>, <7 x i1>)"(i32, <7 x i1>, <7 x i1>) #1 +declare <7 x i64> @"dx.hl.op.rn.<7 x i64> (i32, <7 x i1>, <7 x i64>, <7 x i64>)"(i32, <7 x i1>, <7 x i64>, <7 x i64>) #1 +declare float @"dx.hl.op.rn.float (i32, <7 x float>, <7 x float>)"(i32, <7 x float>, <7 x float>) #1 +declare <7 x i32> @"dx.hl.op.rn.<7 x i32> (i32, <7 x i32>, <7 x i32>)"(i32, <7 x i32>, <7 x i32>) #1 +declare <7 x double>* @"dx.hl.subscript.[].rn.<7 x double>* (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %\22class.RWStructuredBuffer >\22)"(i32, %"class.RWStructuredBuffer >") #1 +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %\22class.RWStructuredBuffer >\22)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %"class.RWStructuredBuffer >") #1 +declare <7 x double> @"dx.hl.op.rn.<7 x double> (i32, <7 x double>, <7 x double>, <7 x double>)"(i32, <7 x double>, <7 x double>, <7 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!pauseresume = !{!1} +!dx.version = !{!3} +!dx.valver = !{!3} +!dx.shaderModel = !{!4} +!dx.typeAnnotations = !{!5, !36} +!dx.entryPoints = !{!40} +!dx.fnprops = !{!52} +!dx.options = !{!53, !54} + +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!3 = !{i32 1, i32 9} +!4 = !{!"cs", i32 6, i32 9} +!5 = !{i32 0, %"class.RWStructuredBuffer >" undef, !6, %"class.RWStructuredBuffer >" undef, !11, %"class.RWStructuredBuffer >" undef, !16, %"class.RWStructuredBuffer >" undef, !21, %"class.RWStructuredBuffer >" undef, !26, %"class.RWStructuredBuffer >" undef, !31} +!6 = !{i32 14, !7, !8} +!7 = !{i32 6, 
!"h", i32 3, i32 0, i32 7, i32 8, i32 13, i32 7} +!8 = !{i32 0, !9} +!9 = !{!10} +!10 = !{i32 0, <7 x half> undef} +!11 = !{i32 28, !12, !13} +!12 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 9, i32 13, i32 7} +!13 = !{i32 0, !14} +!14 = !{!15} +!15 = !{i32 0, <7 x float> undef} +!16 = !{i32 56, !17, !18} +!17 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 10, i32 13, i32 7} +!18 = !{i32 0, !19} +!19 = !{!20} +!20 = !{i32 0, <7 x double> undef} +!21 = !{i32 28, !22, !23} +!22 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 1, i32 13, i32 7} +!23 = !{i32 0, !24} +!24 = !{!25} +!25 = !{i32 0, <7 x i1> undef} +!26 = !{i32 28, !27, !28} +!27 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 5, i32 13, i32 7} +!28 = !{i32 0, !29} +!29 = !{!30} +!30 = !{i32 0, <7 x i32> undef} +!31 = !{i32 56, !32, !33} +!32 = !{i32 6, !"h", i32 3, i32 0, i32 7, i32 6, i32 13, i32 7} +!33 = !{i32 0, !34} +!34 = !{!35} +!35 = !{i32 0, <7 x i64> undef} +!36 = !{i32 1, void ()* @main, !37} +!37 = !{!38} +!38 = !{i32 1, !39, !39} +!39 = !{} +!40 = !{void ()* @main, !"main", null, !41, null} +!41 = !{null, !42, null, null} +!42 = !{!43, !45, !47, !49, !50, !51} +!43 = !{i32 0, %"class.RWStructuredBuffer >"* @"\01?hBuf@@3V?$RWStructuredBuffer@V?$vector@$f16@$06@@@@A", !"hBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !44} +!44 = !{i32 1, i32 14} +!45 = !{i32 1, %"class.RWStructuredBuffer >"* @"\01?fBuf@@3V?$RWStructuredBuffer@V?$vector@M$06@@@@A", !"fBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46} +!46 = !{i32 1, i32 28} +!47 = !{i32 2, %"class.RWStructuredBuffer >"* @"\01?dBuf@@3V?$RWStructuredBuffer@V?$vector@N$06@@@@A", !"dBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !48} +!48 = !{i32 1, i32 56} +!49 = !{i32 3, %"class.RWStructuredBuffer >"* @"\01?bBuf@@3V?$RWStructuredBuffer@V?$vector@_N$06@@@@A", !"bBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46} +!50 = !{i32 4, %"class.RWStructuredBuffer >"* 
@"\01?uBuf@@3V?$RWStructuredBuffer@V?$vector@I$06@@@@A", !"uBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !46} +!51 = !{i32 5, %"class.RWStructuredBuffer >"* @"\01?lBuf@@3V?$RWStructuredBuffer@V?$vector@_J$06@@@@A", !"lBuf", i32 -1, i32 -1, i32 1, i32 12, i1 false, i1 false, i1 false, !48} +!52 = !{void ()* @main, i32 5, i32 8, i32 1, i32 1} +!53 = !{i32 0} +!54 = !{i32 -1} +!59 = !{!60, !60, i64 0} +!60 = !{!"omnipotent char", !61, i64 0} +!61 = !{!"Simple C/C++ TBAA"}