Skip to content

Commit 2bdf33e

Browse files
author
Greg Roth
committed
Enable trivial native vector Dxil Operations plus a few
This enables the generation of native vector DXIL Operations that are "trivial", meaning they take only a single DXOp Call instruction to implement, as well as a few others that either only took such a call and some llvm operations or were of particular interest for other reasons. This involves allowing the overloads by adding the vector indication in hctdb, altering the lowering to maintain the vectors instead of scalarizing them, and a few sundry changes to fix issues along the way. The "trivial" dxil operations that return a different value from the overload type had to be moved out of the way and given their own lowering function so that the main function could generate vectors conditional on the version and vector type. These will be added in a later change. While the long vector supporting intrinsics that weren't given this treatment will continue to generate scalarized operations, some of them needed some work as well. The dot product for float vectors longer than 4 had to take the integer fallback path, which required some small modifications and a rename. Additionally, a heuristic for pow that malfunctioned with too many elements had to have a limit placed on it. Since the or()/and()/select() intrinsics translate directly to LLVM ops, they can have their lowering scalarization removed and what future scalarization might be needed by the current version can be done by later passes as with other LLVM operators. An issue with a special value used to represent unassigned dimensions had to be addressed since new dimensions can exceed that value. It's now UINT_MAX. Contributes to #7120, but I'd prefer to leave it open until all intrinsics are covered.
1 parent 0ffd60a commit 2bdf33e

12 files changed

Lines changed: 1356 additions & 108 deletions

lib/HLSL/HLOperationLower.cpp

Lines changed: 147 additions & 94 deletions
Large diffs are not rendered by default.

tools/clang/lib/Sema/SemaHLSL.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6606,7 +6606,7 @@ bool HLSLExternalSource::MatchArguments(
66066606
argTypes.clear();
66076607
const bool isVariadic = IsVariadicIntrinsicFunction(pIntrinsic);
66086608

6609-
static const UINT UnusedSize = 0xFF;
6609+
static const UINT UnusedSize = UINT_MAX;
66106610
static const BYTE MaxIntrinsicArgs = g_MaxIntrinsicParamCount + 1;
66116611
#define CAB(cond, arg) \
66126612
{ \
@@ -6622,7 +6622,7 @@ bool HLSLExternalSource::MatchArguments(
66226622
ArBasicKind
66236623
ComponentType[MaxIntrinsicArgs]; // Component type for each argument,
66246624
// AR_BASIC_UNKNOWN if unspecified.
6625-
UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UNUSED_INDEX32
6625+
UINT uSpecialSize[IA_SPECIAL_SLOTS]; // row/col matching types, UnusedSize
66266626
// if unspecified.
66276627
badArgIdx = MaxIntrinsicArgs;
66286628

tools/clang/test/CodeGenDXIL/hlsl/types/longvec-intrinsics.hlsl

Lines changed: 391 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
// RUN: %dxc -T ps_6_9 %s | FileCheck %s
2+
3+
// Long vector tests for vec ops that scalarize to something more complex
4+
// than a simple repetition of the same dx.op calls.
5+
6+
StructuredBuffer< vector<float, 8> > buf;
7+
ByteAddressBuffer rbuf;
8+
9+
float4 main(uint i : SV_PrimitiveID, bool b : B) : SV_Target {
10+
vector<float, 8> vec1 = rbuf.Load< vector<float, 8> >(i++*32);
11+
vector<float, 8> vec2 = rbuf.Load< vector<float, 8> >(i++*32);
12+
vector<float, 8> vec3 = rbuf.Load< vector<float, 8> >(i++*32);
13+
14+
// CHECK: fdiv fast <8 x float>
15+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
16+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
17+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
18+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
19+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
20+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
21+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
22+
// CHECK: call float @dx.op.unary.f32(i32 17, float %{{.*}}) ; Atan(value)
23+
// CHECK: fadd fast <8 x float> %{{.*}}, <float 0x
24+
// CHECK: fadd fast <8 x float> %{{.*}}, <float 0x
25+
// CHECK: fcmp fast olt <8 x float>
26+
// CHECK: fcmp fast oeq <8 x float>
27+
// CHECK: fcmp fast oge <8 x float>
28+
// CHECK: fcmp fast olt <8 x float>
29+
// CHECK: and <8 x i1>
30+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float>
31+
// CHECK: and <8 x i1>
32+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float>
33+
// CHECK: and <8 x i1>
34+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> <float 0x
35+
// CHECK: and <8 x i1>
36+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> <float 0x
37+
vec1 = atan2(vec1, vec2);
38+
39+
40+
// CHECK: fdiv fast <8 x float>
41+
// CHECK: fsub fast <8 x float> <float
42+
// CHECK: fcmp fast oge <8 x float>
43+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
44+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
45+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
46+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
47+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
48+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
49+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
50+
// CHECK: call float @dx.op.unary.f32(i32 6, float %{{.*}}) ; FAbs(value)
51+
52+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
53+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
54+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
55+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
56+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
57+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
58+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
59+
// CHECK: call float @dx.op.unary.f32(i32 22, float %{{.*}}) ; Frc(value)
60+
61+
// CHECK: fsub fast <8 x float> <float
62+
// CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float>
63+
// CHECK: fmul fast <8 x float>
64+
vec1 = fmod(vec1, vec2);
65+
66+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
67+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
68+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
69+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
70+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
71+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
72+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
73+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
74+
// CHECK: fmul fast <8 x float>
75+
vec1 = ldexp(vec1, vec2);
76+
77+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
78+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
79+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
80+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
81+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
82+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
83+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
84+
// CHECK: call float @dx.op.unary.f32(i32 23, float %{{.*}}) ; Log(value)
85+
// CHECK: fmul fast <8 x float>
86+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
87+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
88+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
89+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
90+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
91+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
92+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
93+
// CHECK: call float @dx.op.unary.f32(i32 21, float %{{.*}}) ; Exp(value)
94+
vec1 = pow(vec1, vec2);
95+
96+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
97+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
98+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
99+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
100+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
101+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
102+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
103+
// CHECK: call float @dx.op.unary.f32(i32 29, float %{{.*}}) ; Round_z(value)
104+
// CHECK: fsub fast <8 x float>
105+
vec1 = modf(vec1, vec2);
106+
107+
// CHECK: fmul fast float
108+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
109+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
110+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
111+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
112+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
113+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
114+
// CHECK: call float @dx.op.tertiary.f32(i32 46, float %{{.*}}, float %{{.*}}, float %{{.*}}) ; FMad(a,b,c)
115+
vec1 = dot(vec1, vec2);
116+
117+
vector<bool, 8> bvec = b;
118+
// CHECK: or i1
119+
// CHECK: or i1
120+
// CHECK: or i1
121+
// CHECK: or i1
122+
// CHECK: or i1
123+
// CHECK: or i1
124+
// CHECK: or i1
125+
bvec &= any(vec1);
126+
127+
// CHECK: and i1
128+
// CHECK: and i1
129+
// CHECK: and i1
130+
// CHECK: and i1
131+
// CHECK: and i1
132+
// CHECK: and i1
133+
// CHECK: and i1
134+
bvec &= all(vec2);
135+
136+
// call {{.*}} @dx.op.wave
137+
// call {{.*}} @dx.op.wave
138+
// call {{.*}} @dx.op.wave
139+
// call {{.*}} @dx.op.wave
140+
// call {{.*}} @dx.op.wave
141+
// call {{.*}} @dx.op.wave
142+
// call {{.*}} @dx.op.wave
143+
// call {{.*}} @dx.op.wave
144+
// call {{.*}} @dx.op.wave
145+
return WaveMatch(bvec);
146+
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=35 -DNUM=7 %s | FileCheck %s
2+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=35 -DNUM=1022 %s | FileCheck %s
3+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=36 -DNUM=7 %s | FileCheck %s
4+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=36 -DNUM=1022 %s | FileCheck %s
5+
6+
// Test vector-enabled binary intrinsics that take float-like parameters and
7+
// are "trivial" in that they can be implemented with a single call
8+
// instruction with the same parameter and return types.
9+
10+
RWByteAddressBuffer buf;
11+
12+
// CHECK-DAG: %dx.types.ResRet.[[HTY:v[0-9]*f16]] = type { <[[NUM:[0-9]*]] x half>
13+
// CHECK-DAG: %dx.types.ResRet.[[FTY:v[0-9]*f32]] = type { <[[NUM]] x float>
14+
// CHECK-DAG: %dx.types.ResRet.[[DTY:v[0-9]*f64]] = type { <[[NUM]] x double>
15+
16+
[numthreads(8,1,1)]
17+
void main() {
18+
19+
// Capture opcode number.
20+
// CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
21+
// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[OP:[0-9]*]]
22+
buf.Store(999, OP);
23+
24+
// CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
25+
26+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 0
27+
// CHECK: [[hvec1:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
28+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[HTY]] @dx.op.rawBufferVectorLoad.[[HTY]](i32 303, %dx.types.Handle [[buf]], i32 512
29+
// CHECK: [[hvec2:%.*]] = extractvalue %dx.types.ResRet.[[HTY]] [[ld]], 0
30+
vector<float16_t, NUM> hVec1 = buf.Load<vector<float16_t, NUM> >(0);
31+
vector<float16_t, NUM> hVec2 = buf.Load<vector<float16_t, NUM> >(512);
32+
33+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2048
34+
// CHECK: [[fvec1:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
35+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[FTY]] @dx.op.rawBufferVectorLoad.[[FTY]](i32 303, %dx.types.Handle [[buf]], i32 2560
36+
// CHECK: [[fvec2:%.*]] = extractvalue %dx.types.ResRet.[[FTY]] [[ld]], 0
37+
vector<float, NUM> fVec1 = buf.Load<vector<float, NUM> >(2048);
38+
vector<float, NUM> fVec2 = buf.Load<vector<float, NUM> >(2560);
39+
40+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
41+
// CHECK: [[dvec1:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
42+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[DTY]] @dx.op.rawBufferVectorLoad.[[DTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
43+
// CHECK: [[dvec2:%.*]] = extractvalue %dx.types.ResRet.[[DTY]] [[ld]], 0
44+
vector<double, NUM> dVec1 = buf.Load<vector<double, NUM> >(4096);
45+
vector<double, NUM> dVec2 = buf.Load<vector<double, NUM> >(4608);
46+
47+
// Test simple matching type overloads.
48+
49+
// CHECK-NOT: extractelement
50+
// CHECK-NOT: insertelement
51+
// CHECK: call <[[NUM]] x half> @dx.op.binary.[[HTY]](i32 [[OP]], <[[NUM]] x half> [[hvec1]], <[[NUM]] x half> [[hvec2]])
52+
vector<float16_t, NUM> hRes = FUNC(hVec1, hVec2);
53+
54+
// CHECK-NOT: extractelement
55+
// CHECK-NOT: insertelement
56+
// CHECK: call <[[NUM]] x float> @dx.op.binary.[[FTY]](i32 [[OP]], <[[NUM]] x float> [[fvec1]], <[[NUM]] x float> [[fvec2]])
57+
vector<float, NUM> fRes = FUNC(fVec1, fVec2);
58+
59+
// CHECK-NOT: extractelement
60+
// CHECK-NOT: insertelement
61+
// CHECK: call <[[NUM]] x double> @dx.op.binary.[[DTY]](i32 [[OP]], <[[NUM]] x double> [[dvec1]], <[[NUM]] x double> [[dvec2]])
62+
vector<double, NUM> dRes = FUNC(dVec1, dVec2);
63+
64+
// CHECK-NOT: extractelement
65+
// CHECK-NOT: insertelement
66+
buf.Store<vector<float16_t, NUM> >(0, hRes);
67+
buf.Store<vector<float, NUM> >(2048, fRes);
68+
buf.Store<vector<double, NUM> >(4096, dRes);
69+
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=37 -DUOP=39 -DNUM=7 %s | FileCheck %s
2+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=max -DOP=37 -DUOP=39 -DNUM=1022 %s | FileCheck %s
3+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=38 -DUOP=40 -DNUM=7 %s | FileCheck %s
4+
// RUN: %dxc -T cs_6_9 -enable-16bit-types -DFUNC=min -DOP=38 -DUOP=40 -DNUM=1022 %s | FileCheck %s
5+
6+
#ifndef UOP
7+
#define UOP OP
8+
#endif
9+
10+
// Test vector-enabled binary intrinsics that take signed and unsigned integer parameters of
11+
// different widths and are "trivial" in that they can be implemented with a single call
12+
// instruction with the same parameter and return types.
13+
14+
RWByteAddressBuffer buf;
15+
16+
// CHECK-DAG: %dx.types.ResRet.[[STY:v[0-9]*i16]] = type { <[[NUM:[0-9]*]] x i16>
17+
// CHECK-DAG: %dx.types.ResRet.[[ITY:v[0-9]*i32]] = type { <[[NUM]] x i32>
18+
// CHECK-DAG: %dx.types.ResRet.[[LTY:v[0-9]*i64]] = type { <[[NUM]] x i64>
19+
20+
[numthreads(8,1,1)]
21+
void main() {
22+
23+
// Capture opcode numbers.
24+
// CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
25+
// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 888, i32 undef, i32 [[OP:[0-9]*]]
26+
buf.Store(888, OP);
27+
28+
// CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
29+
// CHECK: call void @dx.op.rawBufferStore.i32(i32 140, %dx.types.Handle [[buf]], i32 999, i32 undef, i32 [[UOP:[0-9]*]]
30+
buf.Store(999, UOP);
31+
32+
// CHECK: [[buf:%.*]] = call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle %1, %dx.types.ResourceProperties { i32 4107, i32 0 })
33+
34+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 0
35+
// CHECK: [[svec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
36+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 512
37+
// CHECK: [[svec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
38+
vector<int16_t, NUM> sVec1 = buf.Load<vector<int16_t, NUM> >(0);
39+
vector<int16_t, NUM> sVec2 = buf.Load<vector<int16_t, NUM> >(512);
40+
41+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1024
42+
// CHECK: [[usvec1:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
43+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[STY]] @dx.op.rawBufferVectorLoad.[[STY]](i32 303, %dx.types.Handle [[buf]], i32 1536
44+
// CHECK: [[usvec2:%.*]] = extractvalue %dx.types.ResRet.[[STY]] [[ld]], 0
45+
vector<uint16_t, NUM> usVec1 = buf.Load<vector<uint16_t, NUM> >(1024);
46+
vector<uint16_t, NUM> usVec2 = buf.Load<vector<uint16_t, NUM> >(1536);
47+
48+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2048
49+
// CHECK: [[ivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
50+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 2560
51+
// CHECK: [[ivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
52+
vector<int, NUM> iVec1 = buf.Load<vector<int, NUM> >(2048);
53+
vector<int, NUM> iVec2 = buf.Load<vector<int, NUM> >(2560);
54+
55+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3072
56+
// CHECK: [[uivec1:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
57+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[ITY]] @dx.op.rawBufferVectorLoad.[[ITY]](i32 303, %dx.types.Handle [[buf]], i32 3584
58+
// CHECK: [[uivec2:%.*]] = extractvalue %dx.types.ResRet.[[ITY]] [[ld]], 0
59+
vector<uint, NUM> uiVec1 = buf.Load<vector<uint, NUM> >(3072);
60+
vector<uint, NUM> uiVec2 = buf.Load<vector<uint, NUM> >(3584);
61+
62+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4096
63+
// CHECK: [[lvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
64+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 4608
65+
// CHECK: [[lvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
66+
vector<int64_t, NUM> lVec1 = buf.Load<vector<int64_t, NUM> >(4096);
67+
vector<int64_t, NUM> lVec2 = buf.Load<vector<int64_t, NUM> >(4608);
68+
69+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5120
70+
// CHECK: [[ulvec1:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
71+
// CHECK: [[ld:%.*]] = call %dx.types.ResRet.[[LTY]] @dx.op.rawBufferVectorLoad.[[LTY]](i32 303, %dx.types.Handle [[buf]], i32 5632
72+
// CHECK: [[ulvec2:%.*]] = extractvalue %dx.types.ResRet.[[LTY]] [[ld]], 0
73+
vector<uint64_t, NUM> ulVec1 = buf.Load<vector<uint64_t, NUM> >(5120);
74+
vector<uint64_t, NUM> ulVec2 = buf.Load<vector<uint64_t, NUM> >(5632);
75+
76+
// Test simple matching type overloads.
77+
78+
// CHECK-NOT: extractelement
79+
// CHECK-NOT: insertelement
80+
// CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[OP]], <[[NUM]] x i16> [[svec1]], <[[NUM]] x i16> [[svec2]])
81+
vector<int16_t, NUM> sRes = FUNC(sVec1, sVec2);
82+
83+
// CHECK-NOT: extractelement
84+
// CHECK-NOT: insertelement
85+
// CHECK: call <[[NUM]] x i16> @dx.op.binary.[[STY]](i32 [[UOP]], <[[NUM]] x i16> [[usvec1]], <[[NUM]] x i16> [[usvec2]])
86+
vector<uint16_t, NUM> usRes = FUNC(usVec1, usVec2);
87+
88+
// CHECK-NOT: extractelement
89+
// CHECK-NOT: insertelement
90+
// CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[OP]], <[[NUM]] x i32> [[ivec1]], <[[NUM]] x i32> [[ivec2]])
91+
vector<int, NUM> iRes = FUNC(iVec1, iVec2);
92+
93+
// CHECK-NOT: extractelement
94+
// CHECK-NOT: insertelement
95+
// CHECK: call <[[NUM]] x i32> @dx.op.binary.[[ITY]](i32 [[UOP]], <[[NUM]] x i32> [[uivec1]], <[[NUM]] x i32> [[uivec2]])
96+
vector<uint, NUM> uiRes = FUNC(uiVec1, uiVec2);
97+
98+
// CHECK-NOT: extractelement
99+
// CHECK-NOT: insertelement
100+
// CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[OP]], <[[NUM]] x i64> [[lvec1]], <[[NUM]] x i64> [[lvec2]])
101+
vector<int64_t, NUM> lRes = FUNC(lVec1, lVec2);
102+
103+
// CHECK-NOT: extractelement
104+
// CHECK-NOT: insertelement
105+
// CHECK: call <[[NUM]] x i64> @dx.op.binary.[[LTY]](i32 [[UOP]], <[[NUM]] x i64> [[ulvec1]], <[[NUM]] x i64> [[ulvec2]])
106+
vector<uint64_t, NUM> ulRes = FUNC(ulVec1, ulVec2);
107+
108+
// CHECK-NOT: extractelement
109+
// CHECK-NOT: insertelement
110+
buf.Store<vector<int16_t, NUM> >(0, sRes);
111+
buf.Store<vector<uint16_t, NUM> >(1024, usRes);
112+
buf.Store<vector<int, NUM> >(2048, iRes);
113+
buf.Store<vector<uint, NUM> >(3072, uiRes);
114+
buf.Store<vector<int64_t, NUM> >(4096, lRes);
115+
buf.Store<vector<uint64_t, NUM> >(5120, ulRes);
116+
}

0 commit comments

Comments
 (0)