Skip to content

Commit c3ffca6

Browse files
authored
Respect matrix orientation when doing store op in RWByteAddressBuffer (#3484)
1 parent b89f065 commit c3ffca6

4 files changed

Lines changed: 94 additions & 17 deletions

File tree

tools/clang/lib/CodeGen/CGCall.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3612,6 +3612,17 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
36123612

36133613
llvm::CallSite CS;
36143614
if (!InvokeDest) {
3615+
// HLSL changes begin
3616+
// When storing a matrix to memory, make sure to change its orientation to match in-memory
3617+
// orientation.
3618+
if (getLangOpts().HLSL && CGM.getHLSLRuntime().NeedHLSLMartrixCastForStoreOp(TargetDecl, IRCallArgs)) {
3619+
llvm::SmallVector<clang::QualType, 16> tyList;
3620+
for (CallArgList::const_iterator I = CallArgs.begin(), E = CallArgs.end(); I != E; ++I) {
3621+
tyList.emplace_back(I->Ty);
3622+
}
3623+
CGM.getHLSLRuntime().EmitHLSLMartrixCastForStoreOp(*this, IRCallArgs, tyList);
3624+
}
3625+
// HLSL changes end
36153626
CS = Builder.CreateCall(Callee, IRCallArgs);
36163627
} else {
36173628
llvm::BasicBlock *Cont = createBasicBlock("invoke.cont");

tools/clang/lib/CodeGen/CGHLSLMS.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,11 @@ class CGMSHLSLRuntime : public CGHLSLRuntime {
302302
void MarkLoopStmt(CodeGenFunction &CGF, BasicBlock *loopContinue,
303303
BasicBlock *loopExit) override;
304304
void MarkScopeEnd(CodeGenFunction &CGF) override;
305+
bool NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
306+
llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) override;
307+
void EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
308+
SmallVector<llvm::Value*, 16>& IRCallArgs,
309+
llvm::SmallVector<clang::QualType, 16>& ArgTys) override;
305310
/// Get or add constant to the program
306311
HLCBuffer &GetOrCreateCBuffer(HLSLBufferDecl *D);
307312
};
@@ -4947,6 +4952,60 @@ void CGMSHLSLRuntime::EmitHLSLMatrixStore(CGBuilderTy &Builder, Value *Val,
49474952
Val->getType(), {DestPtr, Val}, TheModule);
49484953
}
49494954

4955+
/// Decide whether the value operand of a call must be re-oriented before the
/// call is emitted.
///
/// \param TD          Declaration of the callee. May be null or a
///                    non-function declaration; both yield `false`.
/// \param IRCallArgs  IR-level arguments that will be passed to the call.
/// \returns true only when the callee is the intrinsic method `MOP_Store`
///          and the argument in the store-value position has an HL matrix
///          type.
bool CGMSHLSLRuntime::NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
    llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) {

  // dyn_cast<> asserts on a null pointer, and nothing here guarantees TD is
  // non-null, so use the null-tolerant form and bail out early.
  const clang::FunctionDecl* FD = dyn_cast_or_null<clang::FunctionDecl>(TD);
  if (!FD)
    return false;

  unsigned opcode = 0;
  StringRef group;
  if (!hlsl::GetIntrinsicOp(FD, opcode, group))
    return false;

  if (opcode != (unsigned)hlsl::IntrinsicOp::MOP_Store)
    return false;

  // Note that the store op is not yet an HL op. It is just a call to the
  // mangled RWByteAddressBuffer store function, so the value operand sits one
  // slot earlier than HLOperandIndex::kStoreValOpIdx.
  const unsigned storeValOpIdx = HLOperandIndex::kStoreValOpIdx - 1;

  // Unexpected arity: assert in checked builds, decline the cast otherwise.
  if (storeValOpIdx >= IRCallArgs.size()) {
    DXASSERT_NOMSG(storeValOpIdx < IRCallArgs.size());
    return false;
  }

  return HLMatrixType::isa(IRCallArgs[storeValOpIdx]->getType());
}
4979+
4980+
/// Replace the store-value argument with a RowMatrixToColMatrix cast of
/// itself when its declared matrix type is not row-major.
///
/// \param CGF         Code generation state; its builder emits the cast.
/// \param IRCallArgs  IR-level call arguments; the store-value slot is
///                    rewritten in place when a cast is emitted.
/// \param ArgTys      Clang types of the call arguments, indexed like
///                    IRCallArgs.
void CGMSHLSLRuntime::EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
    SmallVector<llvm::Value*, 16>& IRCallArgs,
    llvm::SmallVector<clang::QualType, 16>& ArgTys) {

  // The store op is still a plain call to the mangled rwbab store function
  // rather than an HL op, hence the value operand index is shifted down by one.
  const unsigned storeValOpIdx = HLOperandIndex::kStoreValOpIdx - 1;

  // Both lists must reach the value slot; assert in checked builds and
  // otherwise leave the arguments untouched.
  if (storeValOpIdx >= IRCallArgs.size() ||
      storeValOpIdx >= ArgTys.size()) {
    DXASSERT_NOMSG(storeValOpIdx < IRCallArgs.size());
    DXASSERT_NOMSG(storeValOpIdx < ArgTys.size());
    return;
  }

  const clang::QualType &valueTy = ArgTys[storeValOpIdx];
  if (!hlsl::IsHLSLMatType(valueTy))
    return;

  // Row-major values are passed through unchanged.
  if (hlsl::IsHLSLMatRowMajor(valueTy,
                              m_pHLModule->GetHLOptions().bDefaultRowMajor))
    return;

  // Swap the argument for the result of the orientation cast; the cast keeps
  // the operand's IR type.
  llvm::Value *&valueSlot = IRCallArgs[storeValOpIdx];
  valueSlot = EmitHLSLMatrixOperationCallImp(
      CGF.Builder, HLOpcodeGroup::HLCast,
      static_cast<unsigned>(HLCastOpcode::RowMatrixToColMatrix),
      valueSlot->getType(), { valueSlot }, TheModule);
}
5008+
49505009
Value *CGMSHLSLRuntime::EmitHLSLMatrixLoad(CodeGenFunction &CGF, Value *Ptr,
49515010
QualType Ty) {
49525011
return EmitHLSLMatrixLoad(CGF.Builder, Ptr, Ty);

tools/clang/lib/CodeGen/CGHLSLRuntime.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,13 @@ class CGHLSLRuntime {
145145
llvm::BasicBlock *loopExit) = 0;
146146

147147
virtual void MarkScopeEnd(CodeGenFunction &CGF) = 0;
148+
149+
virtual bool NeedHLSLMartrixCastForStoreOp(const clang::Decl* TD,
150+
llvm::SmallVector<llvm::Value*, 16>& IRCallArgs) = 0;
151+
152+
virtual void EmitHLSLMartrixCastForStoreOp(CodeGenFunction& CGF,
153+
llvm::SmallVector<llvm::Value*, 16>& IRCallArgs,
154+
llvm::SmallVector<clang::QualType, 16>& ArgTys) = 0;
148155
};
149156

150157
/// Create an instance of a HLSL runtime class.

tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/rwbab_incomplete_mat_store_const_init_zpc.hlsl

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST1=1 %s | FileCheck %s -check-prefix=CHK_TEST1
22
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST2=1 %s | FileCheck %s -check-prefix=CHK_TEST2
33
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST3=1 %s | FileCheck %s -check-prefix=CHK_TEST3
4-
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST4=1 %s | FileCheck %s -check-prefix=CHK_TEST4 | XFail Github #3423
4+
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST4=1 %s | FileCheck %s -check-prefix=CHK_TEST4
55
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST5=1 %s | FileCheck %s -check-prefix=CHK_TEST5
66
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST6=1 %s | FileCheck %s -check-prefix=CHK_TEST6
77
// RUN: %dxc -E main -T vs_6_5 -Zpc -DTEST7=1 %s | FileCheck %s -check-prefix=CHK_TEST7
@@ -33,49 +33,49 @@ void main()
3333
float2x2 t = {1,2,3,4};
3434
#elif TEST5
3535
// CHK_TEST5: dx.op.rawBufferStore.f32
36-
// CHK_TEST5: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
36+
// CHK_TEST5: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 2.000000e+00, float 5.000000e+00
3737
// CHK_TEST5: dx.op.rawBufferStore.f32
38-
// CHK_TEST5: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00
38+
// CHK_TEST5: i32 16, i32 undef, float 3.000000e+00, float 6.000000e+00
3939
float2x3 t = {1,2,3,4,5,6};
4040
#elif TEST6
4141
// CHK_TEST6: dx.op.rawBufferStore.f32
42-
// CHK_TEST6: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
42+
// CHK_TEST6: i32 0, i32 undef, float 1.000000e+00, float 3.000000e+00, float 5.000000e+00, float 2.000000e+00
4343
// CHK_TEST6: dx.op.rawBufferStore.f32
44-
// CHK_TEST6: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00
44+
// CHK_TEST6: i32 16, i32 undef, float 4.000000e+00, float 6.000000e+00
4545
float3x2 t = {1,2,3,4,5,6};
4646
#elif TEST7
4747
// CHK_TEST7: dx.op.rawBufferStore.f32
48-
// CHK_TEST7: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
48+
// CHK_TEST7: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 7.000000e+00, float 2.000000e+00
4949
// CHK_TEST7: dx.op.rawBufferStore.f32
50-
// CHK_TEST7: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
50+
// CHK_TEST7: i32 16, i32 undef, float 5.000000e+00, float 8.000000e+00, float 3.000000e+00, float 6.000000e+00
5151
// CHK_TEST7: dx.op.rawBufferStore.f32
5252
// CHK_TEST7: i32 32, i32 undef, float 9.000000e+00
5353
float3x3 t = {1,2,3,4,5,6,7,8,9};
5454
#elif TEST8
5555
// CHK_TEST8: dx.op.rawBufferStore.f32
56-
// CHK_TEST8: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
56+
// CHK_TEST8: i32 0, i32 undef, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 2.000000e+00
5757
// CHK_TEST8: dx.op.rawBufferStore.f32
58-
// CHK_TEST8: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
58+
// CHK_TEST8: i32 16, i32 undef, float 6.000000e+00, float 1.000000e+01, float 3.000000e+00, float 7.000000e+00
5959
// CHK_TEST8: dx.op.rawBufferStore.f32
60-
// CHK_TEST8: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
60+
// CHK_TEST8: i32 32, i32 undef, float 1.100000e+01, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01
6161
float3x4 t = {1,2,3,4,5,6,7,8,9,10,11,12};
6262
#elif TEST9
6363
// CHK_TEST9: dx.op.rawBufferStore.f32
64-
// CHK_TEST9: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
64+
// CHK_TEST9: i32 0, i32 undef, float 1.000000e+00, float 4.000000e+00, float 7.000000e+00, float 1.000000e+01
6565
// CHK_TEST9: dx.op.rawBufferStore.f32
66-
// CHK_TEST9: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
66+
// CHK_TEST9: i32 16, i32 undef, float 2.000000e+00, float 5.000000e+00, float 8.000000e+00, float 1.100000e+01
6767
// CHK_TEST9: dx.op.rawBufferStore.f32
68-
// CHK_TEST9: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
68+
// CHK_TEST9: i32 32, i32 undef, float 3.000000e+00, float 6.000000e+00, float 9.000000e+00, float 1.200000e+01
6969
float4x3 t = {1,2,3,4,5,6,7,8,9,10,11,12};
7070
#else
7171
// CHK_TEST10: dx.op.rawBufferStore.f32
72-
// CHK_TEST10: i32 0, i32 undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00
72+
// CHK_TEST10: i32 0, i32 undef, float 1.000000e+00, float 5.000000e+00, float 9.000000e+00, float 1.300000e+01
7373
// CHK_TEST10: dx.op.rawBufferStore.f32
74-
// CHK_TEST10: i32 16, i32 undef, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00
74+
// CHK_TEST10: i32 16, i32 undef, float 2.000000e+00, float 6.000000e+00, float 1.000000e+01, float 1.400000e+01
7575
// CHK_TEST10: dx.op.rawBufferStore.f32
76-
// CHK_TEST10: i32 32, i32 undef, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01, float 1.200000e+01
76+
// CHK_TEST10: i32 32, i32 undef, float 3.000000e+00, float 7.000000e+00, float 1.100000e+01, float 1.500000e+01
7777
// CHK_TEST10: dx.op.rawBufferStore.f32
78-
// CHK_TEST10: i32 48, i32 undef, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01, float 1.600000e+01
78+
// CHK_TEST10: i32 48, i32 undef, float 4.000000e+00, float 8.000000e+00, float 1.200000e+01, float 1.600000e+01
7979
float4x4 t = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
8080
#endif
8181
buffer.Store(0, t);

0 commit comments

Comments
 (0)