diff --git a/docs/DXIL.rst b/docs/DXIL.rst index a1c5055085..c77dfa184a 100644 --- a/docs/DXIL.rst +++ b/docs/DXIL.rst @@ -2419,6 +2419,10 @@ ID Name Description 302 ReservedC9 reserved 303 RawBufferVectorLoad reads from a raw buffer and structured buffer 304 RawBufferVectorStore writes to a RWByteAddressBuffer or RWStructuredBuffer +305 MatVecMul Multiplies a MxK dimension matrix and a K sized input vector +306 MatVecMulAdd multiplies a MxK dimension matrix and a K sized input vector and adds an M-sized bias vector +307 OuterProductAccumulate Computes the outer product between column vectors and an MxN matrix is accumulated component-wise atomically (with device scope) in memory +308 VectorAccumulate Accumulates the components of a vector component-wise atomically (with device scope) to the corresponding elements of an array in memory === ===================================================== ======================================================================================================================================================================================================================= @@ -3134,6 +3138,14 @@ INSTR.ILLEGALDXILOPCODE DXILOpCode must be [0..%0] INSTR.ILLEGALDXILOPFUNCTION '%0' is not a DXILOpFuncition for DXILOpcode '%1'. INSTR.IMMBIASFORSAMPLEB bias amount for sample_b must be in the range [%0,%1], but %2 was specified as an immediate. INSTR.INBOUNDSACCESS Access to out-of-bounds memory is disallowed. +INSTR.LINALGINTERPRETATIONPARAMARECONST In Linalg operations, Interpretation value is a constant. +INSTR.LINALGINVALIDMATRIXLAYOUTVALUEFORMATVECOPS Matrix Layout for Linalg Mul/MulAdd operation must be valid. +INSTR.LINALGINVALIDMEMORYINTERPVALUE In Memory Interpolation value must be valid. +INSTR.LINALGINVALIDREGISTERINTERPVALUE From Register Interpretation value must be valid. +INSTR.LINALGMATRIXLAYOUTNOTTRANSPOSABLE Row Major and Column Major matrix layouts are not transposable. 
+INSTR.LINALGMATRIXSHAPEPARAMSARECONST Matrix Layout, Dimensions and isTranspose are constants +INSTR.LINALGNOTANUNSIGNEDTYPE Unsigned flag set for a float signed type +INSTR.MATVECOPISUNSIGNEDFLAGSARECONST In Linalg Mul/MulAdd functions, IsUnsigned flag is a constant. INSTR.MAYREORDERTHREADUNDEFCOHERENCEHINTPARAM Use of undef coherence hint or num coherence hint bits in MaybeReorderThread. INSTR.MINPRECISIONNOTPRECISE Instructions marked precise may not refer to minprecision values. INSTR.MINPRECISONBITCAST Bitcast on minprecison types is not allowed. diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h index 8c73328fbd..7fa4875070 100644 --- a/include/dxc/DXIL/DxilConstants.h +++ b/include/dxc/DXIL/DxilConstants.h @@ -162,24 +162,32 @@ const unsigned kDxilMaxOloadDims = 2; enum class ComponentType : uint32_t { Invalid = 0, - I1, - I16, - U16, - I32, - U32, - I64, - U64, - F16, - F32, - F64, - SNormF16, - UNormF16, - SNormF32, - UNormF32, - SNormF64, - UNormF64, - PackedS8x32, - PackedU8x32, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, + SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, + // END + LastEntry }; @@ -743,6 +751,19 @@ enum class OpCode : unsigned { CreateHandleForLib = 160, // create resource handle from resource struct for library + // Linear Algebra Operations + MatVecMul = + 305, // Multiplies a MxK dimension matrix and a K sized input vector + MatVecMulAdd = 306, // multiplies a MxK dimension matrix and a K sized input + // vector and adds an M-sized bias vector + OuterProductAccumulate = + 307, // Computes the outer product between column vectors and an MxN + // matrix is accumulated component-wise atomically (with device + // scope) in memory + VectorAccumulate = 308, // Accumulates 
the components of a vector + // component-wise atomically (with device scope) to + // the corresponding elements of an array in memory + // Mesh shader instructions EmitIndices = 169, // emit a primitive's vertex indices in a mesh shader GetMeshPayload = @@ -1060,7 +1081,7 @@ enum class OpCode : unsigned { NumOpCodes_Dxil_1_7 = 226, NumOpCodes_Dxil_1_8 = 258, - NumOpCodes = 305 // exclusive last value of enumeration + NumOpCodes = 309 // exclusive last value of enumeration }; // OPCODE-ENUM:END @@ -1201,6 +1222,12 @@ enum class OpCodeClass : unsigned { // Library create handle from resource struct (like HL intrinsic) CreateHandleForLib, + // Linear Algebra Operations + MatVecMul, + MatVecMulAdd, + OuterProductAccumulate, + VectorAccumulate, + // Mesh shader instructions EmitIndices, GetMeshPayload, @@ -1385,7 +1412,7 @@ enum class OpCodeClass : unsigned { NumOpClasses_Dxil_1_7 = 153, NumOpClasses_Dxil_1_8 = 174, - NumOpClasses = 190 // exclusive last value of enumeration + NumOpClasses = 194 // exclusive last value of enumeration }; // OPCODECLASS-ENUM:END @@ -1556,6 +1583,28 @@ const unsigned kMSStoreOutputColOpIdx = 3; const unsigned kMSStoreOutputVIdxOpIdx = 4; const unsigned kMSStoreOutputValOpIdx = 5; +// MatVec Ops +const unsigned kMatVecMulInputVectorIdx = 1; +const unsigned kMatVecMulIsInputUnsignedIdx = 2; +const unsigned kMatVecMulInputInterpretationIdx = 3; +const unsigned kMatVecMulMatrixBufferIdx = 4; +const unsigned kMatVecMulMatrixOffsetIdx = 5; +const unsigned kMatVecMulMatrixInterpretationIdx = 6; +const unsigned kMatVecMulMatrixMIdx = 7; +const unsigned kMatVecMulMatrixKIdx = 8; +const unsigned kMatVecMulMatrixLayoutIdx = 9; +const unsigned kMatVecMulMatrixTransposeIdx = 10; +const unsigned kMatVecMulMatrixStrideIdx = 11; +const unsigned kMatVecMulIsOutputUnsignedIdx = 12; + +// MatVecAdd +const unsigned kMatVecMulAddBiasInterpretation = 14; +const unsigned kMatVecMulAddIsOutputUnsignedIdx = 15; + +// Outer Product Accumulate +const unsigned 
kOuterProdAccMatrixInterpretation = 5; +const unsigned kOuterProdAccMatrixLayout = 6; + // TODO: add operand index for all the OpCodeClass. } // namespace OperandIndex @@ -2127,6 +2176,13 @@ extern const char *kHostLayoutTypePrefix; extern const char *kWaveOpsIncludeHelperLanesString; +enum class LinalgMatrixLayout : uint32_t { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + } // namespace DXIL } // namespace hlsl diff --git a/include/dxc/DXIL/DxilInstructions.h b/include/dxc/DXIL/DxilInstructions.h index a99c5360d4..9a4030fd8e 100644 --- a/include/dxc/DXIL/DxilInstructions.h +++ b/include/dxc/DXIL/DxilInstructions.h @@ -9918,5 +9918,235 @@ struct DxilInst_RawBufferVectorStore { llvm::APInt(32, (uint64_t)val))); } }; + +/// This instruction Multiplies a MxK dimension matrix and a K sized input +/// vector +struct DxilInst_MatVecMul { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_MatVecMul(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst(Instr, hlsl::OP::OpCode::MatVecMul); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (13 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector = 1, + arg_isInputUnsigned = 2, + arg_inputInterpretation = 3, + arg_matrixBuffer = 4, + arg_matrixOffset = 5, + arg_matrixIntepretation = 6, + arg_matrixM = 7, + arg_matrixK = 8, + arg_matrixLayout = 9, + arg_matrixTranspose = 10, + arg_matrixStride = 11, + arg_isOutputUnsigned = 12, + }; + // Accessors + llvm::Value *get_inputVector() const { return Instr->getOperand(1); } + void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_isInputUnsigned() const { return Instr->getOperand(2); } + void 
set_isInputUnsigned(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_inputInterpretation() const { return Instr->getOperand(3); } + void set_inputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); } + llvm::Value *get_matrixBuffer() const { return Instr->getOperand(4); } + void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(4, val); } + llvm::Value *get_matrixOffset() const { return Instr->getOperand(5); } + void set_matrixOffset(llvm::Value *val) { Instr->setOperand(5, val); } + llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(6); } + void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(6, val); } + llvm::Value *get_matrixM() const { return Instr->getOperand(7); } + void set_matrixM(llvm::Value *val) { Instr->setOperand(7, val); } + llvm::Value *get_matrixK() const { return Instr->getOperand(8); } + void set_matrixK(llvm::Value *val) { Instr->setOperand(8, val); } + llvm::Value *get_matrixLayout() const { return Instr->getOperand(9); } + void set_matrixLayout(llvm::Value *val) { Instr->setOperand(9, val); } + llvm::Value *get_matrixTranspose() const { return Instr->getOperand(10); } + void set_matrixTranspose(llvm::Value *val) { Instr->setOperand(10, val); } + llvm::Value *get_matrixStride() const { return Instr->getOperand(11); } + void set_matrixStride(llvm::Value *val) { Instr->setOperand(11, val); } + llvm::Value *get_isOutputUnsigned() const { return Instr->getOperand(12); } + void set_isOutputUnsigned(llvm::Value *val) { Instr->setOperand(12, val); } +}; + +/// This instruction multiplies a MxK dimension matrix and a K sized input +/// vector and adds an M-sized bias vector +struct DxilInst_MatVecMulAdd { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_MatVecMulAdd(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst(Instr, + hlsl::OP::OpCode::MatVecMulAdd); + } + // Validation support + bool 
isAllowed() const { return true; } + bool isArgumentListValid() const { + if (16 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector = 1, + arg_isInputUnsigned = 2, + arg_inputInterpretation = 3, + arg_matrixBuffer = 4, + arg_matrixOffset = 5, + arg_matrixIntepretation = 6, + arg_matrixM = 7, + arg_matrixK = 8, + arg_matrixLayout = 9, + arg_matrixTranspose = 10, + arg_matrixStride = 11, + arg_biasBuffer = 12, + arg_biasOffset = 13, + arg_biasIntepretation = 14, + arg_isOutputUnsigned = 15, + }; + // Accessors + llvm::Value *get_inputVector() const { return Instr->getOperand(1); } + void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_isInputUnsigned() const { return Instr->getOperand(2); } + void set_isInputUnsigned(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_inputInterpretation() const { return Instr->getOperand(3); } + void set_inputInterpretation(llvm::Value *val) { Instr->setOperand(3, val); } + llvm::Value *get_matrixBuffer() const { return Instr->getOperand(4); } + void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(4, val); } + llvm::Value *get_matrixOffset() const { return Instr->getOperand(5); } + void set_matrixOffset(llvm::Value *val) { Instr->setOperand(5, val); } + llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(6); } + void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(6, val); } + llvm::Value *get_matrixM() const { return Instr->getOperand(7); } + void set_matrixM(llvm::Value *val) { Instr->setOperand(7, val); } + llvm::Value *get_matrixK() const { return Instr->getOperand(8); } + void set_matrixK(llvm::Value *val) { Instr->setOperand(8, val); } + llvm::Value *get_matrixLayout() const { return Instr->getOperand(9); } + void set_matrixLayout(llvm::Value *val) { Instr->setOperand(9, val); } 
+ llvm::Value *get_matrixTranspose() const { return Instr->getOperand(10); } + void set_matrixTranspose(llvm::Value *val) { Instr->setOperand(10, val); } + llvm::Value *get_matrixStride() const { return Instr->getOperand(11); } + void set_matrixStride(llvm::Value *val) { Instr->setOperand(11, val); } + llvm::Value *get_biasBuffer() const { return Instr->getOperand(12); } + void set_biasBuffer(llvm::Value *val) { Instr->setOperand(12, val); } + llvm::Value *get_biasOffset() const { return Instr->getOperand(13); } + void set_biasOffset(llvm::Value *val) { Instr->setOperand(13, val); } + llvm::Value *get_biasIntepretation() const { return Instr->getOperand(14); } + void set_biasIntepretation(llvm::Value *val) { Instr->setOperand(14, val); } + llvm::Value *get_isOutputUnsigned() const { return Instr->getOperand(15); } + void set_isOutputUnsigned(llvm::Value *val) { Instr->setOperand(15, val); } +}; + +/// This instruction Computes the outer product between column vectors and an +/// MxN matrix is accumulated component-wise atomically (with device scope) in +/// memory +struct DxilInst_OuterProductAccumulate { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_OuterProductAccumulate(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst( + Instr, hlsl::OP::OpCode::OuterProductAccumulate); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (8 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector1 = 1, + arg_inputVector2 = 2, + arg_matrixBuffer = 3, + arg_matrixOffset = 4, + arg_matrixIntepretation = 5, + arg_matrixLayout = 6, + arg_matrixStride = 7, + }; + // Accessors + llvm::Value *get_inputVector1() const { return Instr->getOperand(1); } + void set_inputVector1(llvm::Value 
*val) { Instr->setOperand(1, val); } + llvm::Value *get_inputVector2() const { return Instr->getOperand(2); } + void set_inputVector2(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_matrixBuffer() const { return Instr->getOperand(3); } + void set_matrixBuffer(llvm::Value *val) { Instr->setOperand(3, val); } + llvm::Value *get_matrixOffset() const { return Instr->getOperand(4); } + void set_matrixOffset(llvm::Value *val) { Instr->setOperand(4, val); } + llvm::Value *get_matrixIntepretation() const { return Instr->getOperand(5); } + void set_matrixIntepretation(llvm::Value *val) { Instr->setOperand(5, val); } + int32_t get_matrixIntepretation_val() const { + return (int32_t)(llvm::dyn_cast(Instr->getOperand(5)) + ->getZExtValue()); + } + void set_matrixIntepretation_val(int32_t val) { + Instr->setOperand(5, llvm::Constant::getIntegerValue( + llvm::IntegerType::get(Instr->getContext(), 32), + llvm::APInt(32, (uint64_t)val))); + } + llvm::Value *get_matrixLayout() const { return Instr->getOperand(6); } + void set_matrixLayout(llvm::Value *val) { Instr->setOperand(6, val); } + int32_t get_matrixLayout_val() const { + return (int32_t)(llvm::dyn_cast(Instr->getOperand(6)) + ->getZExtValue()); + } + void set_matrixLayout_val(int32_t val) { + Instr->setOperand(6, llvm::Constant::getIntegerValue( + llvm::IntegerType::get(Instr->getContext(), 32), + llvm::APInt(32, (uint64_t)val))); + } + llvm::Value *get_matrixStride() const { return Instr->getOperand(7); } + void set_matrixStride(llvm::Value *val) { Instr->setOperand(7, val); } +}; + +/// This instruction Accumulates the components of a vector component-wise +/// atomically (with device scope) to the corresponding elements of an array in +/// memory +struct DxilInst_VectorAccumulate { + llvm::Instruction *Instr; + // Construction and identification + DxilInst_VectorAccumulate(llvm::Instruction *pInstr) : Instr(pInstr) {} + operator bool() const { + return hlsl::OP::IsDxilOpFuncCallInst(Instr, + 
hlsl::OP::OpCode::VectorAccumulate); + } + // Validation support + bool isAllowed() const { return true; } + bool isArgumentListValid() const { + if (4 != llvm::dyn_cast(Instr)->getNumArgOperands()) + return false; + return true; + } + // Metadata + bool requiresUniformInputs() const { return false; } + // Operand indexes + enum OperandIdx { + arg_inputVector = 1, + arg_arrayBuffer = 2, + arg_arrayOffset = 3, + }; + // Accessors + llvm::Value *get_inputVector() const { return Instr->getOperand(1); } + void set_inputVector(llvm::Value *val) { Instr->setOperand(1, val); } + llvm::Value *get_arrayBuffer() const { return Instr->getOperand(2); } + void set_arrayBuffer(llvm::Value *val) { Instr->setOperand(2, val); } + llvm::Value *get_arrayOffset() const { return Instr->getOperand(3); } + void set_arrayOffset(llvm::Value *val) { Instr->setOperand(3, val); } +}; // INSTR-HELPER:END } // namespace hlsl diff --git a/include/dxc/DxilContainer/RDAT_LibraryTypes.inl b/include/dxc/DxilContainer/RDAT_LibraryTypes.inl index 4b58b406c2..902f2e9652 100644 --- a/include/dxc/DxilContainer/RDAT_LibraryTypes.inl +++ b/include/dxc/DxilContainer/RDAT_LibraryTypes.inl @@ -565,9 +565,13 @@ RDAT_DXIL_ENUM_START(hlsl::DXIL::ComponentType, uint32_t) RDAT_ENUM_VALUE_NODEF(UNormF64) RDAT_ENUM_VALUE_NODEF(PackedS8x32) RDAT_ENUM_VALUE_NODEF(PackedU8x32) + RDAT_ENUM_VALUE_NODEF(U8) + RDAT_ENUM_VALUE_NODEF(I8) + RDAT_ENUM_VALUE_NODEF(F8_E4M3) + RDAT_ENUM_VALUE_NODEF(F8_E5M2) RDAT_ENUM_VALUE_NODEF(LastEntry) #if DEF_RDAT_ENUMS == DEF_RDAT_DUMP_IMPL - static_assert((unsigned)hlsl::DXIL::ComponentType::LastEntry == 19, + static_assert((unsigned)hlsl::DXIL::ComponentType::LastEntry == 23, "otherwise, RDAT_DXIL_ENUM definition needs updating"); #endif RDAT_ENUM_END() diff --git a/include/dxc/HLSL/HLOperations.h b/include/dxc/HLSL/HLOperations.h index f87d324baf..41def3ba2c 100644 --- a/include/dxc/HLSL/HLOperations.h +++ b/include/dxc/HLSL/HLOperations.h @@ -433,6 +433,54 @@ const unsigned 
kNodeHandleToResCastOpIdx = 1; const unsigned kAnnotateNodeHandleNodePropIdx = 2; const unsigned kAnnotateNodeRecordHandleNodeRecordPropIdx = 2; +// Linear Algebra Operations + +// MatVecMul +const unsigned kMatVecMulOutputVectorIdx = 1; +const unsigned kMatVecMulIsOutputUnsignedIdx = 2; +const unsigned kMatVecMulInputVectorIdx = 3; +const unsigned kMatVecMulIsInputUnsignedIdx = 4; +const unsigned kMatVecMulInputInterpretationIdx = 5; +const unsigned kMatVecMulMatrixBufferIdx = 6; +const unsigned kMatVecMulMatrixOffsetIdx = 7; +const unsigned kMatVecMulMatrixInterpretationIdx = 8; +const unsigned kMatVecMulMatrixMIdx = 9; +const unsigned kMatVecMulMatrixKIdx = 10; +const unsigned kMatVecMulMatrixLayoutIdx = 11; +const unsigned kMatVecMulMatrixTransposeIdx = 12; +const unsigned kMatVecMulMatrixStrideIdx = 13; + +// MatVecMulAdd +const unsigned kMatVecMulAddOutputVectorIdx = 1; +const unsigned kMatVecMulAddIsOutputUnsignedIdx = 2; +const unsigned kMatVecMulAddInputVectorIdx = 3; +const unsigned kMatVecMulAddIsInputUnsignedIdx = 4; +const unsigned kMatVecMulAddInputInterpretationIdx = 5; +const unsigned kMatVecMulAddMatrixBufferIdx = 6; +const unsigned kMatVecMulAddMatrixOffsetIdx = 7; +const unsigned kMatVecMulAddMatrixInterpretationIdx = 8; +const unsigned kMatVecMulAddMatrixMIdx = 9; +const unsigned kMatVecMulAddMatrixKIdx = 10; +const unsigned kMatVecMulAddMatrixLayoutIdx = 11; +const unsigned kMatVecMulAddMatrixTransposeIdx = 12; +const unsigned kMatVecMulAddMatrixStrideIdx = 13; +const unsigned kMatVecMulAddBiasBufferIdx = 14; +const unsigned kMatVecMulAddBiasOffsetIdx = 15; +const unsigned kMatVecMulAddBiasInterpretationIdx = 16; + +// OuterProductAccumulate +const unsigned kOuterProdAccInputVec1Idx = 1; +const unsigned kOuterProdAccInputVec2Idx = 2; +const unsigned kOuterProdAccMatrixIdx = 3; +const unsigned kOuterProdAccMatrixOffsetIdx = 4; +const unsigned kOuterProdAccMatrixInterpretationIdx = 5; +const unsigned kOuterProdAccMatrixLayoutIdx = 6; +const 
unsigned kOuterProdAccMatrixStrideIdx = 7; + +// Vector Accumulate +const unsigned kVectorAccInputVecIdx = 1; +const unsigned kVectorAccMatrixIdx = 2; +const unsigned kVectorAccMatrixOffsetIdx = 3; } // namespace HLOperandIndex llvm::Function *GetOrCreateHLFunction(llvm::Module &M, diff --git a/include/dxc/HlslIntrinsicOp.h b/include/dxc/HlslIntrinsicOp.h index d37c27a38e..197bd3e1f5 100644 --- a/include/dxc/HlslIntrinsicOp.h +++ b/include/dxc/HlslIntrinsicOp.h @@ -107,6 +107,10 @@ enum class IntrinsicOp { IOP_WorldToObject = 99, IOP_WorldToObject3x4 = 100, IOP_WorldToObject4x3 = 101, + IOP___builtin_MatVecMul = 390, + IOP___builtin_MatVecMulAdd = 391, + IOP___builtin_OuterProductAccumulate = 392, + IOP___builtin_VectorAccumulate = 393, IOP_abort = 102, IOP_abs = 103, IOP_acos = 104, @@ -396,7 +400,7 @@ enum class IntrinsicOp { IOP_usign = 355, MOP_InterlockedUMax = 356, MOP_InterlockedUMin = 357, - Num_Intrinsics = 390, + Num_Intrinsics = 394, }; inline bool HasUnsignedIntrinsicOpcode(IntrinsicOp opcode) { switch (opcode) { diff --git a/lib/DXIL/DxilOperations.cpp b/lib/DXIL/DxilOperations.cpp index f614ba9d14..95e8dfaeba 100644 --- a/lib/DXIL/DxilOperations.cpp +++ b/lib/DXIL/DxilOperations.cpp @@ -2652,6 +2652,40 @@ const OP::OpCodeProperty OP::m_OpCodeProps[(unsigned)OP::OpCode::NumOpCodes] = { 1, {{0x4e7}}, {{0xe7}}}, // Overloads: hfwidlgetNumParams() <= 1) return nullptr; return FT->getParamType(1); @@ -6291,6 +6382,19 @@ llvm::Type *OP::GetOverloadType(OpCode opCode, llvm::Function *F) { StructType *ST = cast(Ty); return ST->getElementType(0); } + case OpCode::MatVecMul: + case OpCode::MatVecMulAdd: + if (FT->getNumParams() < 2) + return nullptr; + return llvm::StructType::get(Ctx, + {FT->getReturnType(), FT->getParamType(1)}); + + case OpCode::OuterProductAccumulate: + if (FT->getNumParams() < 3) + return nullptr; + return llvm::StructType::get(Ctx, + {FT->getParamType(1), FT->getParamType(2)}); + // OPCODE-OLOAD-TYPES:END default: return Ty; diff --git 
a/lib/DxilValidation/DxilValidation.cpp b/lib/DxilValidation/DxilValidation.cpp index 00a6b9ae14..0b2ccf5f95 100644 --- a/lib/DxilValidation/DxilValidation.cpp +++ b/lib/DxilValidation/DxilValidation.cpp @@ -970,6 +970,267 @@ static void ValidateImmOperandForMathDxilOp(CallInst *CI, DXIL::OpCode Opcode, } } +static bool CheckLinalgInterpretation(uint32_t Input, bool InRegister) { + using CT = DXIL::ComponentType; + switch (static_cast(Input)) { + case CT::I16: + case CT::U16: + case CT::I32: + case CT::U32: + case CT::F16: + case CT::F32: + case CT::U8: + case CT::I8: + case CT::F8_E4M3: + case CT::F8_E5M2: + return true; + case CT::PackedS8x32: + case CT::PackedU8x32: + return InRegister; + default: + return false; + } +} + +static bool CheckMatrixLayoutForMatVecMulOps(unsigned Layout) { + return Layout <= + static_cast(DXIL::LinalgMatrixLayout::OuterProductOptimal); +} + +std::string GetMatrixLayoutStr(unsigned Layout) { + switch (static_cast(Layout)) { + case DXIL::LinalgMatrixLayout::RowMajor: + return "RowMajor"; + case DXIL::LinalgMatrixLayout::ColumnMajor: + return "ColumnMajor"; + case DXIL::LinalgMatrixLayout::MulOptimal: + return "MulOptimal"; + case DXIL::LinalgMatrixLayout::OuterProductOptimal: + return "OuterProductOptimal"; + default: + DXASSERT_NOMSG(false); + return "Invalid"; + } +} + +static bool CheckTransposeForMatrixLayout(unsigned Layout, bool Transposed) { + switch (static_cast(Layout)) { + case DXIL::LinalgMatrixLayout::RowMajor: + case DXIL::LinalgMatrixLayout::ColumnMajor: + return !Transposed; + + default: + return true; + } +} + +static bool CheckUnsignedFlag(Type *VecTy, bool IsUnsigned) { + Type *ElemTy = VecTy->getScalarType(); + if (ElemTy->isFloatingPointTy()) + return !IsUnsigned; + + return true; +} + +static Value *GetMatVecOpIsOutputUnsigned(CallInst *CI, DXIL::OpCode OpCode) { + switch (OpCode) { + case DXIL::OpCode::MatVecMul: + return CI->getOperand(DXIL::OperandIndex::kMatVecMulIsOutputUnsignedIdx); + case 
DXIL::OpCode::MatVecMulAdd: + return CI->getOperand(DXIL::OperandIndex::kMatVecMulAddIsOutputUnsignedIdx); + + default: + DXASSERT_NOMSG(false); + return nullptr; + } +} + +static void ValidateImmOperandsForMatVecOps(CallInst *CI, DXIL::OpCode OpCode, + ValidationContext &ValCtx) { + + llvm::Value *IsInputUnsigned = + CI->getOperand(DXIL::OperandIndex::kMatVecMulIsInputUnsignedIdx); + ConstantInt *IsInputUnsignedConst = + dyn_cast(IsInputUnsigned); + if (!IsInputUnsignedConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrMatVecOpIsUnsignedFlagsAreConst, + {"IsInputUnsigned"}); + return; + } + + llvm::Value *IsOutputUnsigned = GetMatVecOpIsOutputUnsigned(CI, OpCode); + ConstantInt *IsOutputUnsignedConst = + dyn_cast(IsOutputUnsigned); + if (!IsOutputUnsignedConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrMatVecOpIsUnsignedFlagsAreConst, + {"IsOutputUnsigned"}); + return; + } + + llvm::Value *InputInterpretation = + CI->getOperand(DXIL::OperandIndex::kMatVecMulInputInterpretationIdx); + ConstantInt *II = dyn_cast(InputInterpretation); + if (!II) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"InputInterpretation"}); + return; + } + uint64_t IIValue = II->getLimitedValue(); + if (!CheckLinalgInterpretation(IIValue, true)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidRegisterInterpValue, + {std::to_string(IIValue), "Input"}); + return; + } + + llvm::Value *MatrixInterpretation = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixInterpretationIdx); + ConstantInt *MI = dyn_cast(MatrixInterpretation); + if (!MI) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"MatrixInterpretation"}); + return; + } + uint64_t MIValue = MI->getLimitedValue(); + if (!CheckLinalgInterpretation(MIValue, false)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue, + {std::to_string(MIValue), 
"Matrix"}); + return; + } + + llvm::Value *MatrixM = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixMIdx); + if (!llvm::isa(MatrixM)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"Matrix M dimension"}); + return; + } + + llvm::Value *MatrixK = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixKIdx); + if (!llvm::isa(MatrixK)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"Matrix K dimension"}); + return; + } + + llvm::Value *MatrixLayout = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixLayoutIdx); + + ConstantInt *MatrixLayoutConst = dyn_cast(MatrixLayout); + if (!MatrixLayoutConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"Matrix Layout"}); + return; + } + uint64_t MLValue = MatrixLayoutConst->getLimitedValue(); + if (!CheckMatrixLayoutForMatVecMulOps(MLValue)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMatrixLayoutValueForMatVecOps, + {std::to_string(MLValue), + std::to_string( + static_cast(DXIL::LinalgMatrixLayout::RowMajor)), + std::to_string(static_cast( + DXIL::LinalgMatrixLayout::OuterProductOptimal))}); + return; + } + + llvm::Value *MatrixTranspose = + CI->getOperand(DXIL::OperandIndex::kMatVecMulMatrixTransposeIdx); + ConstantInt *MatrixTransposeConst = dyn_cast(MatrixTranspose); + if (!MatrixTransposeConst) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"MatrixTranspose"}); + return; + } + + if (!CheckTransposeForMatrixLayout(MLValue, + MatrixTransposeConst->getLimitedValue())) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixLayoutNotTransposable, + {GetMatrixLayoutStr(MLValue)}); + return; + } + + llvm::Value *InputVector = + CI->getOperand(DXIL::OperandIndex::kMatVecMulInputVectorIdx); + if (!CheckUnsignedFlag(InputVector->getType(), + IsInputUnsignedConst->getLimitedValue())) { + 
ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgNotAnUnsignedType, {"Input"}); + return; + } + + if (!CheckUnsignedFlag(CI->getType(), + IsOutputUnsignedConst->getLimitedValue())) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgNotAnUnsignedType, {"Output"}); + return; + } + + switch (OpCode) { + case DXIL::OpCode::MatVecMulAdd: { + llvm::Value *BiasInterpretation = + CI->getOperand(DXIL::OperandIndex::kMatVecMulAddBiasInterpretation); + ConstantInt *BI = cast(BiasInterpretation); + if (!BI) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"BiasInterpretation"}); + return; + } + uint64_t BIValue = BI->getLimitedValue(); + if (!CheckLinalgInterpretation(BIValue, false)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue, + {std::to_string(BIValue), "Bias vector"}); + return; + } + } break; + default: + break; + } +} + +static void ValidateImmOperandsForOuterProdAcc(CallInst *CI, + ValidationContext &ValCtx) { + + llvm::Value *MatrixInterpretation = + CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixInterpretation); + ConstantInt *MI = cast(MatrixInterpretation); + if (!MI) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInterpretationParamAreConst, + {"MatrixInterpretation"}); + return; + } + uint64_t MIValue = MI->getLimitedValue(); + if (!CheckLinalgInterpretation(MIValue, false)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgInvalidMemoryInterpValue, + {std::to_string(MIValue), "Matrix"}); + return; + } + + llvm::Value *MatrixLayout = + CI->getOperand(DXIL::OperandIndex::kOuterProdAccMatrixLayout); + if (!llvm::isa(MatrixLayout)) { + ValCtx.EmitInstrFormatError( + CI, ValidationRule::InstrLinalgMatrixShapeParamsAreConst, + {"MatrixLayout"}); + return; + } +} + // Validate the type-defined mask compared to the store value mask which // indicates which parts were defined returns true if caller should 
continue // validation @@ -1994,6 +2255,16 @@ static void ValidateDxilOperationCallInProfile(CallInst *CI, GetLaunchTypeStr(NodeLaunchType)}); break; + case DXIL::OpCode::MatVecMul: + case DXIL::OpCode::MatVecMulAdd: + ValidateImmOperandsForMatVecOps(CI, Opcode, ValCtx); + break; + case DXIL::OpCode::OuterProductAccumulate: + ValidateImmOperandsForOuterProdAcc(CI, ValCtx); + break; + case DXIL::OpCode::VectorAccumulate: + + break; default: // TODO: make sure every Opcode is checked. diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp index b5114fa34b..4f55cb377d 100644 --- a/lib/HLSL/HLOperationLower.cpp +++ b/lib/HLSL/HLOperationLower.cpp @@ -6321,6 +6321,200 @@ Value *TranslateSelect(CallInst *CI, IntrinsicOp IOP, OP::OpCode opcode, return Builder.CreateSelect(cond, t, f); } + +Value *TranslateMatVecMul(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input parameters + Value *InputVector = + CI->getArgOperand(HLOperandIndex::kMatVecMulInputVectorIdx); + Value *InputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulIsInputUnsignedIdx); + Value *InputInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulInputInterpretationIdx); + + // Matrix parameters + Value *MatrixBuffer = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixBufferIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixOffsetIdx); + Value *MatrixInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixInterpretationIdx); + Value *MatrixM = CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixMIdx); + Value *MatrixK = CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixKIdx); + Value *MatrixLayout = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixLayoutIdx); + Value *MatrixTranspose = + 
CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixTransposeIdx); + Value *MatrixStride = + CI->getArgOperand(HLOperandIndex::kMatVecMulMatrixStrideIdx); + + // Output parameters + Value *OutputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulIsOutputUnsignedIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc( + OpCode, {CI->getArgOperand(HLOperandIndex::kMatVecMulOutputVectorIdx) + ->getType() + ->getPointerElementType(), + InputVector->getType()}); + + // Create a call to the DXIL function + Value *NewCI = Builder.CreateCall( + DxilFunc, + {OpArg, InputVector, InputIsUnsigned, InputInterpretation, MatrixBuffer, + MatrixOffset, MatrixInterpretation, MatrixM, MatrixK, MatrixLayout, + MatrixTranspose, MatrixStride, OutputIsUnsigned}); + + // Get the output parameter and store the result + Value *OutParam = + CI->getArgOperand(HLOperandIndex::kMatVecMulOutputVectorIdx); + + Builder.CreateStore(NewCI, OutParam); + + return nullptr; +} + +Value *TranslateMatVecMulAdd(CallInst *CI, IntrinsicOp IOP, OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input vector parameters + Value *InputVector = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddInputVectorIdx); + Value *InputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddIsInputUnsignedIdx); + Value *InputInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddInputInterpretationIdx); + + // Matrix parameters + Value *MatrixBuffer = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixBufferIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixOffsetIdx); + Value *MatrixInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixInterpretationIdx); + Value *MatrixM = 
CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixMIdx); + Value *MatrixK = CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixKIdx); + Value *MatrixLayout = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixLayoutIdx); + Value *MatrixTranspose = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixTransposeIdx); + Value *MatrixStride = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddMatrixStrideIdx); + + // Bias parameters + Value *BiasBuffer = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasBufferIdx); + Value *BiasOffset = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasOffsetIdx); + Value *BiasInterpretation = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddBiasInterpretationIdx); + + // Output parameters + Value *OutputIsUnsigned = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddIsOutputUnsignedIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc( + OpCode, {CI->getArgOperand(HLOperandIndex::kMatVecMulAddOutputVectorIdx) + ->getType() + ->getPointerElementType(), + InputVector->getType()}); + + // Create a call to the DXIL function + Value *NewCI = Builder.CreateCall( + DxilFunc, {OpArg, InputVector, InputIsUnsigned, InputInterpretation, + MatrixBuffer, MatrixOffset, MatrixInterpretation, MatrixM, + MatrixK, MatrixLayout, MatrixTranspose, MatrixStride, + BiasBuffer, BiasOffset, BiasInterpretation, OutputIsUnsigned}); + + // Store the result in the output parameter + Value *OutParam = + CI->getArgOperand(HLOperandIndex::kMatVecMulAddOutputVectorIdx); + Builder.CreateStore(NewCI, OutParam); + + return nullptr; +} + +Value *TranslateOuterProductAccumulate(CallInst *CI, IntrinsicOp IOP, + OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input vector parameters + Value *InputVector1 = + 
CI->getArgOperand(HLOperandIndex::kOuterProdAccInputVec1Idx); + Value *InputVector2 = + CI->getArgOperand(HLOperandIndex::kOuterProdAccInputVec2Idx); + + // Matrix parameters + Value *MatrixBuffer = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixOffsetIdx); + Value *MatrixInterpretation = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixInterpretationIdx); + Value *MatrixLayout = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixLayoutIdx); + Value *MatrixStride = + CI->getArgOperand(HLOperandIndex::kOuterProdAccMatrixStrideIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc( + OpCode, {InputVector1->getType(), InputVector2->getType()}); + + return Builder.CreateCall( + DxilFunc, {OpArg, InputVector1, InputVector2, MatrixBuffer, MatrixOffset, + MatrixInterpretation, MatrixLayout, MatrixStride}); +} + +Value *TranslateVectorAccumulate(CallInst *CI, IntrinsicOp IOP, + OP::OpCode OpCode, + HLOperationLowerHelper &Helper, + HLObjectOperationLowerHelper *ObjHelper, + bool &Translated) { + + hlsl::OP *HlslOp = &Helper.hlslOP; + IRBuilder<> Builder(CI); + + Constant *OpArg = HlslOp->GetU32Const(static_cast(OpCode)); + + // Input vector parameter + Value *InputVector = CI->getArgOperand(HLOperandIndex::kVectorAccInputVecIdx); + + // Matrix parameters + Value *MatrixBuffer = CI->getArgOperand(HLOperandIndex::kVectorAccMatrixIdx); + Value *MatrixOffset = + CI->getArgOperand(HLOperandIndex::kVectorAccMatrixOffsetIdx); + + // Get the DXIL function for the operation + Function *DxilFunc = HlslOp->GetOpFunc(OpCode, InputVector->getType()); + + return Builder.CreateCall(DxilFunc, + {OpArg, InputVector, MatrixBuffer, MatrixOffset}); +} + } // namespace // Lower table. 
// RUN: %dxc -T lib_6_9 %s | FileCheck %s

ByteAddressBuffer matrix_buffer;
ByteAddressBuffer bias_buffer;
RWByteAddressBuffer rw_matrix_buffer;
ByteAddressBuffer input_vector_buffer;
RWByteAddressBuffer output_vector_buffer;

// Shared helper exercising all four SM 6.9 linalg builtins so every shader
// stage entry point below lowers the same set of DXIL ops.
// NOTE(review): the vector template arguments below were stripped by the
// extraction; they are reconstructed from the CHECK lines (<4 x float> for
// the mul vectors) and the sibling linalg-builtins test (<8 x i32> for the
// accumulate vectors) -- confirm against the original test.
void UseCoopVec() {
  vector<float, 4> output_vector;
  static const uint is_output_unsigned = 0;

  vector<float, 4> input_vector =
      input_vector_buffer.Load<vector<float, 4> >(0);
  const uint is_input_unsigned = 0;
  const uint input_interpretation = 9; /*F32*/

  const uint matrix_offset = 0;
  const uint matrix_interpretation = 9; /*F32*/
  const uint matrix_dimM = 4;
  const uint matrix_dimK = 4;
  const uint matrix_layout = 0; /*RowMajor*/
  const bool matrix_is_transposed = false;
  const uint matrix_stride = 64;

  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
      matrix_is_transposed, matrix_stride);
  output_vector_buffer.Store(0, output_vector);

  const uint bias_offset = 0;
  const uint bias_interpretation = 9; /*F32*/

  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
      matrix_is_transposed, matrix_stride, bias_buffer, bias_offset,
      bias_interpretation);
  output_vector_buffer.Store(1024, output_vector);

  vector<uint, 8> input_vector1;
  vector<uint, 8> input_vector2;
  const uint opa_matrix_offset = 0;
  const uint opa_matrix_interpretation = 5; /*U32*/
  const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
  const uint opa_matrix_stride = 64;

  __builtin_OuterProductAccumulate(input_vector1, input_vector2,
      rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
      opa_matrix_layout, opa_matrix_stride);

  const uint va_matrix_offset = 0;

  __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
      va_matrix_offset);
}

// CHECK: define void @ps_main()
// CHECK: call <4 x float> @dx.op.matVecMul
// CHECK: call <4 x float> @dx.op.matVecMulAdd
// CHECK: call void @dx.op.outerProductAccumulate
// CHECK: call void @dx.op.vectorAccumulate

[Shader("pixel")]
void ps_main()
{
  UseCoopVec();
}

// CHECK: define void @cs_main()
// CHECK: call <4 x float> @dx.op.matVecMul
// CHECK: call <4 x float> @dx.op.matVecMulAdd
// CHECK: call void @dx.op.outerProductAccumulate
// CHECK: call void @dx.op.vectorAccumulate

[Shader("compute")]
[NumThreads(1,1,1)]
void cs_main()
{
  UseCoopVec();
}

// CHECK: define void @vs_main()
// CHECK: call <4 x float> @dx.op.matVecMul
// CHECK: call <4 x float> @dx.op.matVecMulAdd
// CHECK: call void @dx.op.outerProductAccumulate
// CHECK: call void @dx.op.vectorAccumulate

[Shader("vertex")]
void vs_main()
{
  UseCoopVec();
}

struct MyRecord{
  uint a;
};

// CHECK: define void @ns_main()
// CHECK: call <4 x float> @dx.op.matVecMul
// CHECK: call <4 x float> @dx.op.matVecMulAdd
// CHECK: call void @dx.op.outerProductAccumulate
// CHECK: call void @dx.op.vectorAccumulate

[Shader("node")]
[NodeLaunch("thread")]
void ns_main(ThreadNodeInputRecord<MyRecord> input)
{
  UseCoopVec();
}

// Vertex shader output structure
struct VS_OUT {
  float3 Color : COLOR0;
};

// Geometry shader output structure
struct GS_OUT {
  float3 Color : COLOR0;
  float2 TexCoord : TEXCOORD0;
};

// CHECK: define void @gs_main()
// CHECK: call <4 x float> @dx.op.matVecMul
// CHECK: call <4 x float> @dx.op.matVecMulAdd
// CHECK: call void @dx.op.outerProductAccumulate
// CHECK: call void @dx.op.vectorAccumulate

[shader("geometry")]
[maxvertexcount(3)]
void gs_main(point VS_OUT input[1],
             inout TriangleStream<GS_OUT> OutputStream)
{
  UseCoopVec();
}
// RUN: %dxc -fcgl -T cs_6_9 -E cs_main %s | FileCheck %s

// NOTE(review): the vector template arguments in this file were stripped by
// the extraction; they are reconstructed from this file's own CHECK lines
// (<4 x float> for the mul vectors, <8 x i32> for the accumulate vectors).

ByteAddressBuffer input_vector_buffer;
ByteAddressBuffer opa_input_buffer;
ByteAddressBuffer matrix_buffer;
ByteAddressBuffer bias_buffer;
RWByteAddressBuffer rw_matrix_buffer;
RWByteAddressBuffer output_vector_buffer;

[Shader("compute")]
[NumThreads(1,1,1)]
void cs_main()
{
  vector<float, 4> output_vector;
  static const uint is_output_unsigned = 0;

  vector<float, 4> input_vector =
      input_vector_buffer.Load<vector<float, 4> >(0);
  const uint is_input_unsigned = 0;
  const uint input_interpretation = 9; /*F32*/

  const uint matrix_offset = 0;
  const uint matrix_interpretation = 9; /*F32*/
  const uint matrix_dimM = 4;
  const uint matrix_dimK = 4;
  const uint matrix_layout = 0; /*RowMajor*/
  const bool matrix_is_transposed = false;
  const uint matrix_stride = 64;

  // CHECK: %[[MLD0:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A"
  // CHECK: %[[MCH0:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[MLD0]])
  // CHECK: %[[MAH0:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH0]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef)
  // CHECK: call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x float>* %{{[^ ]+}}, i1 false, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH0]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64)
  __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector,
      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
      matrix_is_transposed, matrix_stride);
  output_vector_buffer.Store(0, output_vector);

  const uint bias_offset = 0;
  const uint bias_interpretation = 9; /*F32*/

  // CHECK: %[[MLD1:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A"
  // CHECK: %[[MCH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[MLD1]])
  // CHECK: %[[MAH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH1]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef)
  // CHECK-NEXT: %[[BLD1:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A"
  // CHECK-NEXT: %[[BCH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %[[BLD1]])
  // CHECK-NEXT: %[[BAH1:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %[[BCH1]], %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer undef)
  // CHECK-NEXT: call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x float>* %{{[^ ]+}}, i1 false, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH1]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %[[BAH1]], i32 0, i32 9)
  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector,
      is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset,
      matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout,
      matrix_is_transposed, matrix_stride, bias_buffer, bias_offset,
      bias_interpretation);
  output_vector_buffer.Store(1024, output_vector);

  vector<uint, 8> input_vector1 = opa_input_buffer.Load<vector<uint, 8> >(0);
  vector<uint, 8> input_vector2 = opa_input_buffer.Load<vector<uint, 8> >(128);
  const uint opa_matrix_offset = 0;
  const uint opa_matrix_interpretation = 5; /*U32*/
  const uint opa_matrix_layout = 3; /*OuterProductOptimal*/
  const uint opa_matrix_stride = 64;

  // CHECK: %[[MLD2:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A"
  // CHECK: %[[MCH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD2]])
  // CHECK: %[[MAH2:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH2]], %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef)
  // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH2]], i32 0, i32 5, i32 3, i32 64)
  __builtin_OuterProductAccumulate(input_vector1, input_vector2,
      rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation,
      opa_matrix_layout, opa_matrix_stride);

  const uint va_matrix_offset = 0;

  // CHECK: %[[MLD3:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A"
  // CHECK: %[[MCH3:[^ ]+]] = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %[[MLD3]])
  // CHECK: %[[MAH3:[^ ]+]] = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %[[MCH3]], %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer undef)
  // CHECK: call void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32 393, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[MAH3]], i32 0)
  __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer,
      va_matrix_offset);
}

// ---- (from lit.local.cfg for this directory, carried in the same patch) ----
// config.unsupported = 'dxil-1-9' not in config.available_features

// ---- mat-vec-mul-add_multioverload.hlsl begins here in the patch ----
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-0
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-1
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 | FileCheck %s --check-prefixes COMMON,DXIL-2
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 | FileCheck %s --check-prefixes COMMON,DXIL-3
// RUN: %dxc -T cs_6_9 %s
-enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 | FileCheck %s --check-prefixes COMMON,DXIL-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 | FileCheck %s --check-prefixes COMMON,DXIL-7 + +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -DBI=F16 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -DBI=I32 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -DBI=I32 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -DBI=I8 -fcgl | FileCheck %s 
--check-prefixes COMMON,HLOP-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -DBI=I8 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7 + + +// COMMON: define void @main() + +// Test minimum support set of combinations for matVecMul +// HLOP-0: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8) +// DXIL-0: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) +// HLOP-1: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8) +// DXIL-1: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) +// 
HLOP-2: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8) +// DXIL-2: call <4 x half> @dx.op.matVecMulAdd.v4f16.v8f16(i32 306, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) +// HLOP-3: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4) +// DXIL-3: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) +// HLOP-4: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 
false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 4) +// DXIL-4: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 4, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) + +// Test unsigned variations +// HLOP-5: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20) +// DXIL-5: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8f32(i32 306, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 true) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned) +// HLOP-6: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20) +// DXIL-6: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, 
i32 20, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)
// HLOP-7: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20)
// DXIL-7: call <4 x i32> @dx.op.matVecMulAdd.v4i32.v8i32(i32 306, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i1 false) ; MatVecMulAdd(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,biasBuffer,biasOffset,biasIntepretation,isOutputUnsigned)


ByteAddressBuffer input_vector_buffer;
ByteAddressBuffer matrix_buffer;
ByteAddressBuffer bias_buffer;
RWByteAddressBuffer rw_matrix_buffer;
RWByteAddressBuffer output_vector_buffer;

// Mirrors DXIL::ComponentType values (including the SM 6.9 additions) so the
// -DII/-DMI/-DBI macros expand to the numeric interpretation codes.
enum CompType {
  Invalid = 0,
  I1 = 1,
  I16 = 2,
  U16 = 3,
  I32 = 4,
  U32 = 5,
  I64 = 6,
  U64 = 7,
  F16 = 8,
  F32 = 9,
  F64 = 10,
  SNormF16 = 11,
  UNormF16 = 12,
  SNormF32 = 13,
  UNormF32 = 14,
  SNormF64 = 15,
  UNormF64 = 16,
  PackedS8x32 = 17,
  PackedU8x32 = 18,

  // BEGIN NEW FOR SM 6.9
  U8 = 19,
  I8 = 20,
  F8_E4M3 = 21,
  F8_E5M2 = 22,
};

enum MatLayout {
  RowMajor = 0,
  ColumnMajor = 1,
  MulOptimal = 2,
  OuterProductOptimal = 3,
};

[NumThreads(1,1,1)]
void main()
{
  // NOTE(review): vector template arguments were stripped by the extraction;
  // reconstructed from this file's CHECK lines (<4 x OTY> output,
  // <8 x ITY> input, dimM = dimK = 8) -- confirm.
  vector<OTY, 4> output_vector;
  static const uint is_output_unsigned = OU;

  vector<ITY, 8> input_vector = input_vector_buffer.Load<vector<ITY, 8> >(0);
  const uint is_input_unsigned = IU;
  const uint input_interpretation = II;

  const uint matrix_offset = 0;
  const uint matrix_interpretation = MI;
  const uint matrix_dimM = 8;
  const uint matrix_dimK = 8;
  const uint matrix_layout = ML;
  const bool matrix_is_transposed = (bool) MT;
  const uint matrix_stride = 64;

  const uint bias_offset = 0;
  const uint bias_interpretation = BI;

  __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation,
                         matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, bias_interpretation);
  output_vector_buffer.Store(0, output_vector);
}

// ---- mat-vec-mul_multioverload.hlsl begins here in the patch ----
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-0
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-1
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-2
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-3
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-4
// RUN: %dxc -T cs_6_9 %s -enable-16bit-types
-DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 | FileCheck %s --check-prefixes COMMON,DXIL-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 | FileCheck %s --check-prefixes COMMON,DXIL-7 + +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F16 -DMI=F16 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E4M3 -DMI=F8_E4M3 -DML=MulOptimal -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=float16_t -DIU=0 -DITY=float16_t -DII=F8_E5M2 -DMI=F8_E5M2 -DML=MulOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=uint -DII=PackedS8x32 -DMI=I8 -DML=OuterProductOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-3 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=float -DII=I8 -DMI=I8 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-4 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=1 -DOTY=uint -DIU=0 -DITY=float -DII=I8 -DMI=F16 -DML=RowMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-5 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=1 -DITY=uint -DII=U8 -DMI=I8 -DML=ColumnMajor -DMT=0 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-6 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DOU=0 -DOTY=int -DIU=0 -DITY=int -DII=U8 -DMI=U8 -DML=MulOptimal -DMT=1 -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-7 + +// COMMON: define void @main() + +// Test minimum support set of combinations for 
matVecMul +// HLOP-0: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 8, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64) +// DXIL-0: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 8, %dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) +// HLOP-1: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 21, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64) +// DXIL-1: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 21, %dx.types.Handle {{[^ ]+}}, i32 0, i32 21, i32 8, i32 8, i32 2, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) +// HLOP-2: call void @"dx.hl.op..void (i32, <4 x half>*, i1, <8 x half>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x half>* %output_vector, i1 false, <8 x half> %{{[^ ]+}}, i1 false, i32 22, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64) +// DXIL-2: call <4 x half> @dx.op.matVecMul.v4f16.v8f16(i32 305, <8 x half> {{[^ ]+}}, i1 false, i32 22, %dx.types.Handle {{[^ ]+}}, i32 0, i32 22, i32 8, i32 8, i32 2, i1 true, i32 64, i1 false) ; 
MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) +// HLOP-3: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 17, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64) +// DXIL-3: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 17, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 3, i1 true, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) +// HLOP-4: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64) +// DXIL-4: call <4 x i32> @dx.op.matVecMul.v4i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 0, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + +// Test unsigned variations +// HLOP-5: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 true, <8 x float> %{{[^ ]+}}, i1 false, i32 20, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64) +// DXIL-5: call <4 x i32> @dx.op.matVecMul.v4i32.v8f32(i32 305, <8 x float> {{[^ ]+}}, i1 false, i32 20, 
%dx.types.Handle {{[^ ]+}}, i32 0, i32 8, i32 8, i32 8, i32 0, i1 false, i32 64, i1 true) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) +// HLOP-6: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 true, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64) +// DXIL-6: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 true, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 20, i32 8, i32 8, i32 1, i1 false, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) +// HLOP-7: call void @"dx.hl.op..void (i32, <4 x i32>*, i1, <8 x i32>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x i32>* %output_vector, i1 false, <8 x i32> %{{[^ ]+}}, i1 false, i32 19, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64) +// DXIL-7: call <4 x i32> @dx.op.matVecMul.v4i32.v8i32(i32 305, <8 x i32> {{[^ ]+}}, i1 false, i32 19, %dx.types.Handle {{[^ ]+}}, i32 0, i32 19, i32 8, i32 8, i32 2, i1 true, i32 64, i1 false) ; MatVecMul(inputVector,isInputUnsigned,inputInterpretation,matrixBuffer,matrixOffset,matrixIntepretation,matrixM,matrixK,matrixLayout,matrixTranspose,matrixStride,isOutputUnsigned) + + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; +RWByteAddressBuffer output_vector_buffer; + +enum CompType { + Invalid = 0, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, 
+ SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, +}; + +enum MatLayout { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + +[NumThreads(1,1,1)] +void main() +{ + vector output_vector; + static const uint is_output_unsigned = OU; + + vector input_vector = input_vector_buffer.Load >(0); + const uint is_input_unsigned = IU; + const uint input_interpretation = II; + + const uint matrix_offset = 0; + const uint matrix_interpretation = MI; + const uint matrix_dimM = 8; + const uint matrix_dimK = 8; + const uint matrix_layout = ML; + const bool matrix_is_transposed = (bool) MT; + const uint matrix_stride = 64; + + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, matrix_interpretation, + matrix_dimM, matrix_dimK, matrix_layout, matrix_is_transposed, matrix_stride); + output_vector_buffer.Store(0, output_vector); +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl new file mode 100644 index 0000000000..40bbe62284 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/outer-product-accumulate-multioverload.hlsl @@ -0,0 +1,70 @@ +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor | FileCheck %s --check-prefixes COMMON,DXIL-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-2 + +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 
-DML=RowMajor -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 + +ByteAddressBuffer input_vector_buffer; +ByteAddressBuffer input_vector_buffer2; +RWByteAddressBuffer matrix_buffer; + +// COMMON: define void @main() +// DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) +// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64) +// DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) +// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64) +// DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) +// HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %{{[^ ]+}}, 
<8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64) + +enum CompType { + Invalid = 0, + I1 = 1, + I16 = 2, + U16 = 3, + I32 = 4, + U32 = 5, + I64 = 6, + U64 = 7, + F16 = 8, + F32 = 9, + F64 = 10, + SNormF16 = 11, + UNormF16 = 12, + SNormF32 = 13, + UNormF32 = 14, + SNormF64 = 15, + UNormF64 = 16, + PackedS8x32 = 17, + PackedU8x32 = 18, + + // BEGIN NEW FOR SM 6.9 + U8 = 19, + I8 = 20, + F8_E4M3 = 21, + F8_E5M2 = 22, +}; + +enum MatLayout { + RowMajor = 0, + ColumnMajor = 1, + MulOptimal = 2, + OuterProductOptimal = 3, +}; + + +[Numthreads(1,1,1)] +void main() +{ + vector input_vector1 = input_vector_buffer.Load >(0); + vector input_vector2 = input_vector_buffer2.Load >(0); + + const uint matrix_interpretation = MI; + const uint matrix_layout = ML; + const uint matrix_offset = 0; + const uint matrix_stride = 64; + + __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride); + +} diff --git a/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl new file mode 100644 index 0000000000..dc1bb6c563 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/intrinsics/linalg_builtins/vector-accumulate.hlsl @@ -0,0 +1,16 @@ +// RUN: %dxc -T cs_6_9 %s | FileCheck %s + +RWByteAddressBuffer matrix_buffer; + +// Test use of __builtin_VectorAccumulate in compute shader +// CHECK: define void @main() +// CHECK: call void @dx.op.vectorAccumulate.v2i32(i32 {{[0-9]+}}, <2 x i32> , %dx.types.Handle {{%[0-9]+}}, i32 0) + +[NumThreads(1,1,1)] +void main() +{ + vector input_vector1 = 5; + const uint matrix_offset = 0; + + __builtin_VectorAccumulate(input_vector1, matrix_buffer, matrix_offset); +} diff --git a/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll new file mode 100644 index 
0000000000..6623f63031 --- /dev/null +++ b/tools/clang/test/DXC/Passes/DxilGen/linalg-builtins.ll @@ -0,0 +1,189 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxilgen -S | FileCheck %s +; REQUIRES: dxil-1-9 + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%struct.ByteAddressBuffer = type { i32 } +%struct.RWByteAddressBuffer = type { i32 } +%dx.types.Handle = type { i8* } +%dx.types.ResourceProperties = type { i32, i32 } + +@"\01?input_vector_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?opa_input_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?matrix_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?bias_buffer@@3UByteAddressBuffer@@A" = external global %struct.ByteAddressBuffer, align 4 +@"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 +@"\01?output_vector_buffer@@3URWByteAddressBuffer@@A" = external global %struct.RWByteAddressBuffer, align 4 + +; Function Attrs: nounwind +define void @cs_main() #0 { +entry: + ;CHECK-DAG: %[[MLD:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A" + ;CHECK-DAG: %[[BLD:[^ ]+]] = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A" + ;CHECK-DAG: %[[RWMLD0:[^ ]+]] = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A" + %output_vector = alloca <4 x float>, align 4 + %tmp = bitcast <4 x float>* %output_vector to i8*, !dbg !21 ; line:14 col:5 + call void @llvm.lifetime.start(i64 16, i8* %tmp) #0, !dbg !21 ; line:14 col:5 + %tmp1 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?input_vector_buffer@@3UByteAddressBuffer@@A", !dbg !25 ; line:17 col:37 + %tmp2 = call %dx.types.Handle 
@"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp1), !dbg !25 ; line:17 col:37 + %tmp3 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp2, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !25 ; line:17 col:37 + %tmp4 = call <4 x float> @"dx.hl.op.ro.<4 x float> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp3, i32 0), !dbg !25 ; line:17 col:37 + %tmp5 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !dbg !26 ; line:33 col:5 + %tmp6 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp5), !dbg !26 ; line:33 col:5 + %tmp7 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp6, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !26 ; line:33 col:5 + + ;CHECK: %[[MCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[MLD]] + ;CHECK: %[[MAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[MCH0]] + ;CHECK: call <4 x float> @dx.op.matVecMul.v4f32.v4f32(i32 305, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH0]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, i1 false) + call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32 390, <4 x float>* %output_vector, i1 false, <4 x float> %tmp4, i1 false, i32 9, %dx.types.Handle %tmp7, i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64), !dbg !26 ; line:33 col:5 + + %tmp8 = load <4 x 
float>, <4 x float>* %output_vector, align 4, !dbg !27, !tbaa !28 ; line:37 col:35 + %tmp9 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !dbg !31 ; line:37 col:5 + %tmp10 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp9), !dbg !31 ; line:37 col:5 + %tmp11 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp10, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !31 ; line:37 col:5 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32 277, %dx.types.Handle %tmp11, i32 0, <4 x float> %tmp8), !dbg !31 ; line:37 col:5 + %tmp12 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !dbg !32 ; line:49 col:5 + %tmp13 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp12), !dbg !32 ; line:49 col:5 + %tmp14 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp13, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !32 ; line:49 col:5 + %tmp15 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A", !dbg !32 ; line:49 col:5 + %tmp16 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp15), !dbg !32 ; line:49 col:5 + %tmp17 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle 
%tmp16, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !32 ; line:49 col:5 + + ;CHECK: %[[MCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[MLD]] + ;CHECK: %[[MAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[MCH1]] + ;CHECK: %[[BCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.ByteAddressBuffer(i32 160, %struct.ByteAddressBuffer %[[BLD]] + ;CHECK: %[[BAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[BCH1]] + ;CHECK: call <4 x float> @dx.op.matVecMulAdd.v4f32.v4f32(i32 306, <4 x float> %{{[^ ]+}}, i1 false, i32 9, %dx.types.Handle %[[MAH1]], i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %[[BAH1]], i32 0, i32 9, i1 false) + call void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32 391, <4 x float>* %output_vector, i1 false, <4 x float> %tmp4, i1 false, i32 9, %dx.types.Handle %tmp14, i32 0, i32 9, i32 4, i32 4, i32 0, i1 false, i32 64, %dx.types.Handle %tmp17, i32 0, i32 9), !dbg !32 ; line:49 col:5 + + %tmp18 = load <4 x float>, <4 x float>* %output_vector, align 4, !dbg !33, !tbaa !28 ; line:54 col:38 + %tmp19 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !dbg !34 ; line:54 col:5 + %tmp20 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp19), !dbg !34 ; line:54 col:5 + %tmp21 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp20, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !34 ; 
line:54 col:5 + call void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32 277, %dx.types.Handle %tmp21, i32 1024, <4 x float> %tmp18), !dbg !34 ; line:54 col:5 + %tmp22 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !dbg !35 ; line:56 col:37 + %tmp23 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp22), !dbg !35 ; line:56 col:37 + %tmp24 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp23, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !35 ; line:56 col:37 + %tmp25 = call <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp24, i32 0), !dbg !35 ; line:56 col:37 + %tmp26 = load %struct.ByteAddressBuffer, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !dbg !36 ; line:57 col:37 + %tmp27 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32 0, %struct.ByteAddressBuffer %tmp26), !dbg !36 ; line:57 col:37 + %tmp28 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp27, %dx.types.ResourceProperties { i32 11, i32 0 }, %struct.ByteAddressBuffer zeroinitializer), !dbg !36 ; line:57 col:37 + %tmp29 = call <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32 231, %dx.types.Handle %tmp28, i32 128), !dbg !36 ; line:57 col:37 + %tmp30 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !37 ; line:67 col:5 + %tmp31 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, 
%struct.RWByteAddressBuffer %tmp30), !dbg !37 ; line:67 col:5 + %tmp32 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp31, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !37 ; line:67 col:5 + + ;CHECK: %[[RWMCH0:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]] + ;CHECK: %[[RWMAH0:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH0]] + ;CHECK: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH0]], i32 0, i32 5, i32 3, i32 64) + call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 392, <8 x i32> %tmp25, <8 x i32> %tmp29, %dx.types.Handle %tmp32, i32 0, i32 5, i32 3, i32 64), !dbg !37 ; line:67 col:5 + + + %tmp33 = load %struct.RWByteAddressBuffer, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !dbg !38 ; line:77 col:5 + %tmp34 = call %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32 0, %struct.RWByteAddressBuffer %tmp33), !dbg !38 ; line:77 col:5 + %tmp35 = call %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32 14, %dx.types.Handle %tmp34, %dx.types.ResourceProperties { i32 4107, i32 0 }, %struct.RWByteAddressBuffer zeroinitializer), !dbg !38 ; line:77 col:5 + + ;CHECK: %[[RWMCH1:[^ ]+]] = call %dx.types.Handle @dx.op.createHandleForLib.struct.RWByteAddressBuffer(i32 160, %struct.RWByteAddressBuffer %[[RWMLD0]] + ;CHECK: %[[RWMAH1:[^ ]+]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %[[RWMCH1]] + ;CHECK: call void 
@dx.op.vectorAccumulate.v8i32(i32 308, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %[[RWMAH1]], i32 0) + call void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32 393, <8 x i32> %tmp25, %dx.types.Handle %tmp35, i32 0), !dbg !38 ; line:77 col:5 + + %tmp36 = bitcast <4 x float>* %output_vector to i8*, !dbg !39 ; line:79 col:1 + call void @llvm.lifetime.end(i64 16, i8* %tmp36) #0, !dbg !39 ; line:79 col:1 + ret void, !dbg !39 ; line:79 col:1 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind readonly +declare <4 x float> @"dx.hl.op.ro.<4 x float> (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.ByteAddressBuffer)"(i32, %struct.ByteAddressBuffer) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer)"(i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.ByteAddressBuffer) #2 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32)"(i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, %dx.types.Handle, i32, <4 x float>)"(i32, %dx.types.Handle, i32, <4 x float>) #0 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.createhandle..%dx.types.Handle (i32, %struct.RWByteAddressBuffer)"(i32, %struct.RWByteAddressBuffer) #2 + +; Function Attrs: nounwind readnone +declare %dx.types.Handle @"dx.hl.annotatehandle..%dx.types.Handle (i32, %dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer)"(i32, 
%dx.types.Handle, %dx.types.ResourceProperties, %struct.RWByteAddressBuffer) #2 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32)"(i32, <4 x float>*, i1, <4 x float>, i1, i32, %dx.types.Handle, i32, i32, i32, i32, i32, i1, i32, %dx.types.Handle, i32, i32) #0 + +; Function Attrs: nounwind readonly +declare <8 x i32> @"dx.hl.op.ro.<8 x i32> (i32, %dx.types.Handle, i32)"(i32, %dx.types.Handle, i32) #1 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32) #0 + +; Function Attrs: nounwind +declare void @"dx.hl.op..void (i32, <8 x i32>, %dx.types.Handle, i32)"(i32, <8 x i32>, %dx.types.Handle, i32) #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!pauseresume = !{!1} +!dx.version = !{!2} +!dx.valver = !{!2} +!dx.shaderModel = !{!3} +!dx.typeAnnotations = !{!4} +!dx.entryPoints = !{!8} +!dx.fnprops = !{!18} +!dx.options = !{!19, !20} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!2 = !{i32 1, i32 9} +!3 = !{!"cs", i32 6, i32 9} +!4 = !{i32 1, void ()* @cs_main, !5} +!5 = !{!6} +!6 = !{i32 1, !7, !7} +!7 = !{} +!8 = !{void ()* @cs_main, !"cs_main", null, !9, null} +!9 = !{!10, !15, null, null} +!10 = !{!11, !12, !13, !14} +!11 = !{i32 0, %struct.ByteAddressBuffer* @"\01?input_vector_buffer@@3UByteAddressBuffer@@A", !"input_vector_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!12 = !{i32 1, %struct.ByteAddressBuffer* @"\01?opa_input_buffer@@3UByteAddressBuffer@@A", !"opa_input_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!13 = !{i32 2, %struct.ByteAddressBuffer* @"\01?matrix_buffer@@3UByteAddressBuffer@@A", !"matrix_buffer", i32 -1, i32 -1, i32 1, 
i32 11, i32 0, null} +!14 = !{i32 3, %struct.ByteAddressBuffer* @"\01?bias_buffer@@3UByteAddressBuffer@@A", !"bias_buffer", i32 -1, i32 -1, i32 1, i32 11, i32 0, null} +!15 = !{!16, !17} +!16 = !{i32 0, %struct.RWByteAddressBuffer* @"\01?rw_matrix_buffer@@3URWByteAddressBuffer@@A", !"rw_matrix_buffer", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!17 = !{i32 1, %struct.RWByteAddressBuffer* @"\01?output_vector_buffer@@3URWByteAddressBuffer@@A", !"output_vector_buffer", i32 -1, i32 -1, i32 1, i32 11, i1 false, i1 false, i1 false, null} +!18 = !{void ()* @cs_main, i32 5, i32 1, i32 1, i32 1} +!19 = !{i32 -2147483584} +!20 = !{i32 -1} +!21 = !DILocation(line: 14, column: 5, scope: !22) +!22 = !DISubprogram(name: "cs_main", scope: !23, file: !23, line: 12, type: !24, isLocal: false, isDefinition: true, scopeLine: 13, flags: DIFlagPrototyped, isOptimized: false, function: void ()* @cs_main) +!23 = !DIFile(filename: "DirectXShaderCompiler\5Ctools\5Cclang\5Ctest\5CCodeGenDXIL\5Chlsl\5Cintrinsics\5Clinalg_builtins\5Clinalg-builtins.hlsl", directory: "") +!24 = !DISubroutineType(types: !7) +!25 = !DILocation(line: 17, column: 37, scope: !22) +!26 = !DILocation(line: 33, column: 5, scope: !22) +!27 = !DILocation(line: 37, column: 35, scope: !22) +!28 = !{!29, !29, i64 0} +!29 = !{!"omnipotent char", !30, i64 0} +!30 = !{!"Simple C/C++ TBAA"} +!31 = !DILocation(line: 37, column: 5, scope: !22) +!32 = !DILocation(line: 49, column: 5, scope: !22) +!33 = !DILocation(line: 54, column: 38, scope: !22) +!34 = !DILocation(line: 54, column: 5, scope: !22) +!35 = !DILocation(line: 56, column: 37, scope: !22) +!36 = !DILocation(line: 57, column: 37, scope: !22) +!37 = !DILocation(line: 67, column: 5, scope: !22) +!38 = !DILocation(line: 77, column: 5, scope: !22) +!39 = !DILocation(line: 79, column: 1, scope: !22) diff --git a/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl new file 
mode 100644 index 0000000000..d5e251ae8b --- /dev/null +++ b/tools/clang/test/SemaHLSL/hlsl/linalg/unavailable-pre-sm69.hlsl @@ -0,0 +1,59 @@ +// RUN: %dxc -T lib_6_8 %s -verify + +ByteAddressBuffer matrix_buffer; +ByteAddressBuffer bias_buffer; +RWByteAddressBuffer rw_matrix_buffer; + +[Shader("compute")] +[Numthreads(1,1,1)] +void cs_main() +{ + vector output_vector; + static const uint is_output_unsigned = 0; + + vector input_vector; + const uint is_input_unsigned = 0; + const uint input_interpretation = 9; /*F32*/ + + const uint matrix_offset = 0; + const uint matrix_interpretation = 9; /*F32*/ + const uint matrix_dimM = 4; + const uint matrix_dimK = 4; + const uint matrix_layout = 0; /*RowMajor*/ + const bool matrix_is_transposed = false; + const uint matrix_stride = 64; + + //expected-error@+1{{intrinsic __builtin_MatVecMul potentially used by 'cs_main' requires shader model 6.9 or greater}} + __builtin_MatVecMul(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride); + + const uint bias_offset = 0; + const uint bias_interpretation = 9; /*F32*/ + + //expected-error@+1{{intrinsic __builtin_MatVecMulAdd potentially used by 'cs_main' requires shader model 6.9 or greater}} + __builtin_MatVecMulAdd(output_vector, is_output_unsigned, input_vector, + is_input_unsigned, input_interpretation, matrix_buffer, matrix_offset, + matrix_interpretation, matrix_dimM, matrix_dimK, matrix_layout, + matrix_is_transposed, matrix_stride, bias_buffer, bias_offset, + bias_interpretation); + + vector input_vector1; + vector input_vector2; + const uint opa_matrix_offset = 0; + const uint opa_matrix_interpretation = 5; /*U32*/ + const uint opa_matrix_layout = 3; /*OuterProductOptimal*/ + const uint opa_matrix_stride = 64; + + //expected-error@+1{{intrinsic __builtin_OuterProductAccumulate potentially used by 
'cs_main' requires shader model 6.9 or greater}} + __builtin_OuterProductAccumulate(input_vector1, input_vector2, + rw_matrix_buffer, opa_matrix_offset, opa_matrix_interpretation, + opa_matrix_layout, opa_matrix_stride); + + const uint va_matrix_offset = 0; + + //expected-error@+1{{intrinsic __builtin_VectorAccumulate potentially used by 'cs_main' requires shader model 6.9 or greater}} + __builtin_VectorAccumulate(input_vector1, rw_matrix_buffer, + va_matrix_offset); +} \ No newline at end of file diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt index f1274fd308..c394611302 100644 --- a/utils/hct/gen_intrin_main.txt +++ b/utils/hct/gen_intrin_main.txt @@ -383,6 +383,14 @@ void [[]] Barrier(in NodeRecordOrUAV o, in uint SemanticFlags); uint [[]] GetRemainingRecursionLevels(); +void [[]] __builtin_MatVecMul(out numeric OutputVector, in bool OutputIsUnsigned, in numeric InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride); + +void [[]] __builtin_MatVecMulAdd(out numeric OutputVector, in bool OutputIsUnsigned, in numeric InputVector, in bool InputIsUnsigned, in uint InputInterpretation, in ByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint M, in uint K, in uint MatrixLayout, in bool MatrixIsTransposed, in uint MatrixStride, in ByteAddressBuffer BiasVector, in uint BiasOffset, in uint BiasInterpretation); + +void [[]] __builtin_OuterProductAccumulate(in numeric InputVector1, in numeric InputVector2, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset, in uint MatrixInterpretation, in uint MatrixLayout, in uint MatrixStride); + +void [[]] __builtin_VectorAccumulate(in numeric InputVector, in RWByteAddressBuffer MatrixBuffer, in uint MatrixOffset); + } namespace diff --git a/utils/hct/hctdb.py 
b/utils/hct/hctdb.py index 6344fb5849..63af8c0b38 100644 --- a/utils/hct/hctdb.py +++ b/utils/hct/hctdb.py @@ -873,6 +873,11 @@ def populate_categories_and_models(self): "library", "raygeneration", ) + for i in ( + "MatVecMul,MatVecMulAdd,OuterProductAccumulate,VectorAccumulate" + ).split(","): + self.name_idx[i].category = "Linear Algebra Operations" + self.name_idx[i].shader_model = 6, 9 def populate_llvm_instructions(self): # Add instructions that map to LLVM instructions. @@ -6340,6 +6345,103 @@ def UFI(name, **mappings): ) next_op_idx += 1 + self.add_dxil_op( + "MatVecMul", + next_op_idx, + "MatVecMul", + "Multiplies a MxK dimension matrix and a K sized input vector", + "