Skip to content

Commit 0c95d20

Browse files
author
Greg Roth
committed
Lower native vector raw buffer loads/stores into new ops
When the loaded/stored type is a vector of more than 1 element, the shader model is 6.9 or higher, and the operation is on a raw buffer, enable the generation of a native vector raw buffer load or store. Also incidentally removes an unused parameter in load translation, and adds validation and compute shader tests.
1 parent 53f5b21 commit 0c95d20

6 files changed

Lines changed: 949 additions & 33 deletions

File tree

lib/HLSL/HLOperationLower.cpp

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3953,6 +3953,11 @@ struct ResLoadHelper {
39533953
: intrinsicOpCode(IntrinsicOp::Num_Intrinsics), handle(h), retVal(Inst),
39543954
addr(idx), offset(Offset), status(nullptr), mipLevel(mip) {
39553955
opcode = LoadOpFromResKind(RK);
3956+
Type *Ty = Inst->getType();
3957+
if (opcode == OP::OpCode::RawBufferLoad && Ty->isVectorTy() &&
3958+
Ty->getVectorNumElements() > 1 &&
3959+
Inst->getModule()->GetHLModule().GetShaderModel()->IsSM69Plus())
3960+
opcode = OP::OpCode::RawBufferVectorLoad;
39563961
}
39573962
OP::OpCode opcode;
39583963
IntrinsicOp intrinsicOpCode;
@@ -4022,6 +4027,14 @@ ResLoadHelper::ResLoadHelper(CallInst *CI, DxilResource::Kind RK,
40224027
if (RC == DxilResourceBase::Class::SRV)
40234028
OffsetIdx = IsMS ? HLOperandIndex::kTex2DMSLoadOffsetOpIdx
40244029
: HLOperandIndex::kTexLoadOffsetOpIdx;
4030+
} else if (opcode == OP::OpCode::RawBufferLoad) {
4031+
// If native vectors are available and this load had a vector
4032+
// with more than one element, convert the RawBufferLoad to the
4033+
// native vector variant RawBufferVectorLoad.
4034+
Type *Ty = CI->getType();
4035+
if (Ty->isVectorTy() && Ty->getVectorNumElements() > 1 &&
4036+
CI->getModule()->GetHLModule().GetShaderModel()->IsSM69Plus())
4037+
opcode = OP::OpCode::RawBufferVectorLoad;
40254038
}
40264039

40274040
// Set offset.
@@ -4079,7 +4092,7 @@ Value *GenerateRawBufLd(Value *handle, Value *bufIdx, Value *offset,
40794092
// Sets up arguments for buffer load call.
40804093
static SmallVector<Value *, 10> GetBufLoadArgs(ResLoadHelper helper,
40814094
HLResource::Kind RK,
4082-
IRBuilder<> Builder, Type *EltTy,
4095+
IRBuilder<> Builder,
40834096
unsigned LdSize) {
40844097
OP::OpCode opcode = helper.opcode;
40854098
llvm::Constant *opArg = Builder.getInt32((uint32_t)opcode);
@@ -4127,6 +4140,7 @@ static SmallVector<Value *, 10> GetBufLoadArgs(ResLoadHelper helper,
41274140
// If not TextureLoad, it could be a typed or raw buffer load.
41284141
// They have mostly similar arguments.
41294142
DXASSERT(opcode == OP::OpCode::RawBufferLoad ||
4143+
opcode == OP::OpCode::RawBufferVectorLoad ||
41304144
opcode == OP::OpCode::BufferLoad,
41314145
"Wrong opcode in get load args");
41324146
Args.emplace_back(
@@ -4137,6 +4151,9 @@ static SmallVector<Value *, 10> GetBufLoadArgs(ResLoadHelper helper,
41374151
// Unlike typed buffer load, raw buffer load has mask and alignment.
41384152
Args.emplace_back(nullptr); // Mask will be added later %4.
41394153
Args.emplace_back(alignmentVal); // alignment @5.
4154+
} else if (opcode == OP::OpCode::RawBufferVectorLoad) {
4155+
// RawBufferVectorLoad takes just alignment, no mask.
4156+
Args.emplace_back(alignmentVal); // alignment @4
41404157
}
41414158
}
41424159
return Args;
@@ -4162,18 +4179,19 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
41624179
if (isBool || (is64 && isTyped))
41634180
EltTy = Builder.getInt32Ty();
41644181

4165-
// 64-bit types are stored as int32 pairs in typed buffers.
4182+
// Adjust number of components as needed.
41664183
if (is64 && isTyped) {
4184+
// 64-bit types are stored as int32 pairs in typed buffers.
41674185
DXASSERT(NumComponents <= 2, "Typed buffers only allow 4 dwords.");
41684186
NumComponents *= 2;
4187+
} else if (opcode == OP::OpCode::RawBufferVectorLoad) {
4188+
// Native vector loads only have a single vector element in ResRet.
4189+
EltTy = VectorType::get(EltTy, NumComponents);
4190+
NumComponents = 1;
41694191
}
41704192

41714193
unsigned LdSize = DL.getTypeAllocSize(EltTy);
4172-
4173-
SmallVector<Value *, 4> Elts(NumComponents);
4174-
4175-
SmallVector<Value *, 10> Args =
4176-
GetBufLoadArgs(helper, RK, Builder, EltTy, LdSize);
4194+
SmallVector<Value *, 10> Args = GetBufLoadArgs(helper, RK, Builder, LdSize);
41774195

41784196
// Keep track of the first load for debug info migration.
41794197
Value *FirstLd = nullptr;
@@ -4185,9 +4203,10 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
41854203
else if (RK == DxilResource::Kind::StructuredBuffer)
41864204
OffsetIdx = DXIL::OperandIndex::kRawBufferLoadElementOffsetOpIdx;
41874205

4188-
// Create calls to function object.
4206+
// Create call(s) to function object and collect results in Elts.
41894207
// Typed buffer loads are limited to one load of up to 4 32-bit values.
41904208
// Raw buffer loads might need multiple loads in chunks of 4.
4209+
SmallVector<Value *, 4> Elts(NumComponents);
41914210
for (unsigned i = 0; i < NumComponents;) {
41924211
// Load 4 elements or however many less than 4 are left to load.
41934212
unsigned chunkSize = std::min(NumComponents - i, 4U);
@@ -4197,7 +4216,7 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
41974216
Args[DXIL::OperandIndex::kRawBufferLoadMaskOpIdx] =
41984217
GetRawBufferMaskForETy(EltTy, chunkSize, OP);
41994218
// If we've loaded a chunk already, update offset to next chunk.
4200-
if (FirstLd != nullptr && opcode == OP::OpCode::RawBufferLoad)
4219+
if (FirstLd != nullptr)
42014220
Args[OffsetIdx] =
42024221
Builder.CreateAdd(Args[OffsetIdx], OP->GetU32Const(4 * LdSize));
42034222
}
@@ -4206,8 +4225,13 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
42064225
Value *Ld = Builder.CreateCall(F, Args, OP::GetOpCodeName(opcode));
42074226

42084227
// Extract elements from returned ResRet.
4209-
for (unsigned j = 0; j < chunkSize; j++, i++)
4210-
Elts[i] = Builder.CreateExtractValue(Ld, j);
4228+
// Native vector loads just have one vector element in the ResRet.
4229+
// Others have up to four scalars that need to be individually extracted.
4230+
if (opcode == OP::OpCode::RawBufferVectorLoad)
4231+
Elts[i++] = Builder.CreateExtractValue(Ld, 0);
4232+
else
4233+
for (unsigned j = 0; j < chunkSize; j++, i++)
4234+
Elts[i] = Builder.CreateExtractValue(Ld, j);
42114235

42124236
// Update status.
42134237
UpdateStatus(Ld, helper.status, Builder, OP);
@@ -4245,9 +4269,10 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
42454269
}
42464270
}
42474271

4248-
// Package elements into a vector.
4272+
// Package elements into a vector as needed.
42494273
Value *retValNew = nullptr;
4250-
if (!Ty->isVectorTy()) {
4274+
// Scalar or native vector loads need not construct vectors from elements.
4275+
if (!Ty->isVectorTy() || opcode == OP::OpCode::RawBufferVectorLoad) {
42514276
retValNew = Elts[0];
42524277
} else {
42534278
retValNew = UndefValue::get(VectorType::get(EltTy, NumComponents));
@@ -4345,6 +4370,10 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
43454370
case DxilResource::Kind::StructuredBuffer:
43464371
IsTyped = false;
43474372
opcode = OP::OpCode::RawBufferStore;
4373+
// Where shader model and type allows, use vector store intrinsic.
4374+
if (OP->GetModule()->GetHLModule().GetShaderModel()->IsSM69Plus() &&
4375+
Ty->isVectorTy() && Ty->getVectorNumElements() > 1)
4376+
opcode = OP::OpCode::RawBufferVectorStore;
43484377
break;
43494378
case DxilResource::Kind::TypedBuffer:
43504379
opcode = OP::OpCode::BufferStore;
@@ -4387,7 +4416,6 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
43874416
EltTy = i32Ty;
43884417
}
43894418

4390-
Function *F = OP->GetOpFunc(opcode, EltTy);
43914419
llvm::Constant *opArg = OP->GetU32Const((unsigned)opcode);
43924420

43934421
llvm::Value *undefI =
@@ -4401,6 +4429,7 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
44014429

44024430
unsigned OffsetIdx = 0;
44034431
if (opcode == OP::OpCode::RawBufferStore ||
4432+
opcode == OP::OpCode::RawBufferVectorStore ||
44044433
opcode == OP::OpCode::BufferStore) {
44054434
// Append Coord0 (Index) value.
44064435
if (Idx->getType()->isVectorTy()) {
@@ -4420,7 +4449,6 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
44204449
OffsetIdx = storeArgs.size() - 1;
44214450

44224451
// Coord1 (Offset).
4423-
// Only relevant when storing more than 4 elements to structured buffers.
44244452
storeArgs.emplace_back(offset);
44254453
} else {
44264454
// texture store
@@ -4441,6 +4469,16 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
44414469
// TODO: support mip for texture ST
44424470
}
44434471

4472+
// RawBufferVectorStore only takes a single value and alignment arguments.
4473+
if (opcode == DXIL::OpCode::RawBufferVectorStore) {
4474+
storeArgs.emplace_back(val);
4475+
storeArgs.emplace_back(Alignment);
4476+
Function *F = OP->GetOpFunc(DXIL::OpCode::RawBufferVectorStore, Ty);
4477+
Builder.CreateCall(F, storeArgs);
4478+
return;
4479+
}
4480+
Function *F = OP->GetOpFunc(opcode, EltTy);
4481+
44444482
constexpr unsigned MaxStoreElemCount = 4;
44454483
const unsigned CompCount = Ty->isVectorTy() ? Ty->getVectorNumElements() : 1;
44464484
const unsigned StoreInstCount =
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// RUN: %dxc -DTYPE=float -DNUM=4 -T vs_6_9 %s | FileCheck %s
2+
// RUN: %dxc -DTYPE=bool -DNUM=4 -T vs_6_9 %s | FileCheck %s --check-prefixes=CHECK,I1
3+
// RUN: %dxc -DTYPE=uint64_t -DNUM=2 -T vs_6_9 %s | FileCheck %s
4+
// RUN: %dxc -DTYPE=double -DNUM=2 -T vs_6_9 %s | FileCheck %s
5+
6+
// RUN: %dxc -DTYPE=float -DNUM=6 -T vs_6_9 %s | FileCheck %s
7+
// RUN: %dxc -DTYPE=bool -DNUM=13 -T vs_6_9 %s | FileCheck %s --check-prefixes=CHECK,I1
8+
// RUN: %dxc -DTYPE=uint64_t -DNUM=24 -T vs_6_9 %s | FileCheck %s
9+
// RUN: %dxc -DTYPE=double -DNUM=32 -T vs_6_9 %s | FileCheck %s
10+
11+
///////////////////////////////////////////////////////////////////////
12+
// Test codegen for various load and store operations and conversions
13+
// for different scalar/vector buffer types and indices.
14+
///////////////////////////////////////////////////////////////////////
15+
16+
// CHECK: %dx.types.ResRet.[[VTY:v[0-9]*[a-z][0-9][0-9]]] = type { <[[NUM:[0-9]*]] x [[TYPE:[a-z_0-9]*]]>, i32 }
17+
18+
ByteAddressBuffer RoByBuf : register(t1);
19+
RWByteAddressBuffer RwByBuf : register(u1);
20+
21+
StructuredBuffer< vector<TYPE, NUM> > RoStBuf : register(t2);
22+
RWStructuredBuffer< vector<TYPE, NUM> > RwStBuf : register(u2);
23+
24+
ConsumeStructuredBuffer<vector<TYPE, NUM> > CnStBuf : register(u4);
25+
AppendStructuredBuffer<vector<TYPE, NUM> > ApStBuf : register(u5);
26+
27+
// CHECK-LABEL: define void @main
28+
[shader("vertex")]
29+
void main(uint ix[2] : IX) {
30+
// ByteAddressBuffer Tests
31+
32+
// CHECK-DAG: [[HDLROBY:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 0 }, i32 1, i1 false)
33+
// CHECK-DAG: [[HDLRWBY:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 1, i32 1, i32 0, i8 1 }, i32 1, i1 false)
34+
35+
// CHECK-DAG: [[HDLROST:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 2, i32 2, i32 0, i8 0 }, i32 2, i1 false)
36+
// CHECK-DAG: [[HDLRWST:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 2, i32 2, i32 0, i8 1 }, i32 2, i1 false)
37+
38+
// CHECK-DAG: [[HDLCON:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 4, i32 4, i32 0, i8 1 }, i32 4, i1 false)
39+
// CHECK-DAG: [[HDLAPP:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 5, i32 5, i32 0, i8 1 }, i32 5, i1 false)
40+
41+
// CHECK: [[IX0:%.*]] = call i32 @dx.op.loadInput.i32(i32 4,
42+
43+
// CHECK: [[ANHDLRWBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWBY]]
44+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0]]
45+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
46+
vector<TYPE, NUM> babElt1 = RwByBuf.Load< vector<TYPE, NUM> >(ix[0]);
47+
48+
// CHECK: [[ANHDLROBY:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROBY]]
49+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROBY]], i32 [[IX0]]
50+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
51+
vector<TYPE, NUM> babElt2 = RoByBuf.Load< vector<TYPE, NUM> >(ix[0]);
52+
53+
// I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32>
54+
// CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLRWBY]], i32 [[IX0]]
55+
RwByBuf.Store< vector<TYPE, NUM> >(ix[0], babElt1 + babElt2);
56+
57+
// StructuredBuffer Tests
58+
// CHECK: [[ANHDLRWST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLRWST]]
59+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]]
60+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
61+
vector<TYPE, NUM> stbElt1 = RwStBuf.Load(ix[0]);
62+
// CHECK: [[IX1:%.*]] = call i32 @dx.op.loadInput.i32(i32 4,
63+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLRWST]], i32 [[IX1]]
64+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
65+
vector<TYPE, NUM> stbElt2 = RwStBuf[ix[1]];
66+
67+
// CHECK: [[ANHDLROST:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLROST]]
68+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX0]]
69+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
70+
vector<TYPE, NUM> stbElt3 = RoStBuf.Load(ix[0]);
71+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLROST]], i32 [[IX1]]
72+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
73+
vector<TYPE, NUM> stbElt4 = RoStBuf[ix[1]];
74+
75+
// I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32>
76+
// CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLRWST]], i32 [[IX0]]
77+
RwStBuf[ix[0]] = stbElt1 + stbElt2 + stbElt3 + stbElt4;
78+
79+
// {Append/Consume}StructuredBuffer Tests
80+
// CHECK: [[ANHDLCON:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLCON]]
81+
// CHECK: [[CONIX:%.*]] = call i32 @dx.op.bufferUpdateCounter(i32 70, %dx.types.Handle [[ANHDLCON]], i8 -1)
82+
// CHECK: call %dx.types.ResRet.[[VTY]] @dx.op.rawBufferVectorLoad.[[VTY]](i32 303, %dx.types.Handle [[ANHDLCON]], i32 [[CONIX]]
83+
// I1: icmp ne <[[NUM]] x i32> %{{.*}}, zeroinitializer
84+
vector<TYPE, NUM> cnElt = CnStBuf.Consume();
85+
86+
// CHECK: [[ANHDLAPP:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[HDLAPP]]
87+
// CHECK: [[APPIX:%.*]] = call i32 @dx.op.bufferUpdateCounter(i32 70, %dx.types.Handle [[ANHDLAPP]], i8 1)
88+
// I1: zext <[[NUM]] x i1> %{{.*}} to <[[NUM]] x i32>
89+
// CHECK: all void @dx.op.rawBufferVectorStore.[[VTY]](i32 304, %dx.types.Handle [[ANHDLAPP]], i32 [[APPIX]]
90+
ApStBuf.Append(cnElt);
91+
}

0 commit comments

Comments
 (0)