|
1 | | -// RUN: %dxc -T cs_6_9 %s | FileCheck %s |
| 1 | +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor | FileCheck %s --check-prefixes COMMON,DXIL-0 |
| 2 | +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-1 |
| 3 | +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal | FileCheck %s --check-prefixes COMMON,DXIL-2 |
2 | 4 |
|
| 5 | +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F16 -DML=RowMajor -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-0 |
| 6 | +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=float16_t -DMI=F8_E4M3 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-1 |
| 7 | +// RUN: %dxc -T cs_6_9 %s -enable-16bit-types -DITY=uint -DMI=U8 -DML=OuterProductOptimal -fcgl | FileCheck %s --check-prefixes COMMON,HLOP-2 |
| 8 | + |
// Resources bound for this test. main() loads one vector<ITY, 8> operand from
// byte offset 0 of each ByteAddressBuffer, and passes matrix_buffer as the
// buffer argument of __builtin_OuterProductAccumulate.
| 9 | +ByteAddressBuffer input_vector_buffer; |
| 10 | +ByteAddressBuffer input_vector_buffer2; |
3 | 11 | RWByteAddressBuffer matrix_buffer; |
4 | 12 |
|
5 | | -// CHECK: define void @main() |
6 | | -// CHECK: call void @dx.op.outerProductAccumulate.v2i32.v4i32(i32 {{[0-9]+}} |
| 13 | +// COMMON: define void @main() |
| 14 | +// DXIL-0: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) |
| 15 | +// HLOP-0: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 365, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 8, i32 0, i32 64) |
| 16 | +// DXIL-1: call void @dx.op.outerProductAccumulate.v8f16.v8f16(i32 307, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) |
| 17 | +// HLOP-1: call void @"dx.hl.op..void (i32, <8 x half>, <8 x half>, %dx.types.Handle, i32, i32, i32, i32)"(i32 365, <8 x half> %{{[^ ]+}}, <8 x half> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 21, i32 3, i32 64) |
| 18 | +// DXIL-2: call void @dx.op.outerProductAccumulate.v8i32.v8i32(i32 307, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64) ; OuterProductAccumulate(inputVector1,inputVector2,matrixBuffer,matrixOffset,matrixIntepretation,matrixLayout,matrixStride) |
| 19 | +// HLOP-2: call void @"dx.hl.op..void (i32, <8 x i32>, <8 x i32>, %dx.types.Handle, i32, i32, i32, i32)"(i32 365, <8 x i32> %{{[^ ]+}}, <8 x i32> %{{[^ ]+}}, %dx.types.Handle %{{[^ ]+}}, i32 0, i32 19, i32 3, i32 64) |
| 20 | + |
// CompType: named integer codes chosen on the RUN lines via -DMI=<name>.
// main() stores MI in matrix_interpretation and forwards it to
// __builtin_OuterProductAccumulate. The CHECK lines in this file expect
// F16 -> i32 8, U8 -> i32 19 and F8_E4M3 -> i32 21, so the values below must
// not be renumbered.
// NOTE(review): presumably mirrors the compiler's own component-type
// enumeration - confirm against the DXIL headers before editing.
| 21 | +enum CompType { |
| 22 | + Invalid = 0, |
| 23 | + I1 = 1, |
| 24 | + I16 = 2, |
| 25 | + U16 = 3, |
| 26 | + I32 = 4, |
| 27 | + U32 = 5, |
| 28 | + I64 = 6, |
| 29 | + U64 = 7, |
| 30 | + F16 = 8, |
| 31 | + F32 = 9, |
| 32 | + F64 = 10, |
| 33 | + SNormF16 = 11, |
| 34 | + UNormF16 = 12, |
| 35 | + SNormF32 = 13, |
| 36 | + UNormF32 = 14, |
| 37 | + SNormF64 = 15, |
| 38 | + UNormF64 = 16, |
| 39 | + PackedS8x32 = 17, |
| 40 | + PackedU8x32 = 18, |
| 41 | + |
| 42 | + // BEGIN NEW FOR SM 6.9 |
| 43 | + U8 = 19, |
| 44 | + I8 = 20, |
| 45 | + F8_E4M3 = 21, |
| 46 | + F8_E5M2 = 22, |
| 47 | +}; |
| 48 | + |
// MatLayout: named integer codes chosen on the RUN lines via -DML=<name>.
// main() stores ML in matrix_layout and forwards it to
// __builtin_OuterProductAccumulate. The CHECK lines in this file expect
// RowMajor -> i32 0 and OuterProductOptimal -> i32 3.
| 49 | +enum MatLayout { |
| 50 | + RowMajor = 0, |
| 51 | + ColumnMajor = 1, |
| 52 | + MulOptimal = 2, |
| 53 | + OuterProductOptimal = 3, |
| 54 | +}; |
| 55 | + |
7 | 56 |
|
8 | 57 | [Numthreads(1,1,1)] |
9 | 58 | void main() |
10 | 59 | { |
11 | | - vector<uint, 2> input_vector1 = 1; |
12 | | - vector<uint, 4> input_vector2 = 2; |
| 60 | + vector<ITY, 8> input_vector1 = input_vector_buffer.Load<vector<ITY, 8> >(0); |
| 61 | + vector<ITY, 8> input_vector2 = input_vector_buffer2.Load<vector<ITY, 8> >(0); |
13 | 62 |
|
| 63 | + const uint matrix_interpretation = MI; |
| 64 | + const uint matrix_layout = ML; |
14 | 65 | const uint matrix_offset = 0; |
15 | | - const uint matrix_interpretation = 5; /*U32*/ |
16 | | - const uint matrix_layout = 0; |
17 | 66 | const uint matrix_stride = 64; |
18 | 67 |
|
19 | 68 | __builtin_OuterProductAccumulate(input_vector1, input_vector2, matrix_buffer, matrix_offset, matrix_interpretation, matrix_layout, matrix_stride); |
|
0 commit comments