Skip to content

Commit e13d0dc

Browse files
alsepkow and Copilot committed
Fix rawBufferVectorLoad/Store to widen min precision types to 32-bit
RawBufferVectorLoad/Store for min precision types (min16int, min16uint, min16float) was emitting i16/f16 vector operations (e.g., v3i16), which causes WARP and potentially other drivers to load/store 2 bytes per element instead of 4. This mismatches the buffer layout when the CPU writes 32-bit values.

Pre-SM6.9 RawBufferLoad correctly handles this by loading as i32/f32 and truncating. Apply the same pattern for SM6.9 vector variants:

- RawBufferVectorLoad: load as v_i32/v_f32, then trunc to i16/half
- RawBufferVectorStore: sext/fpext to i32/f32, then store as v_i32/v_f32

This matches the existing bool widening pattern already in TranslateBufLoad.

Co-authored-by: Copilot <[email protected]>
1 parent fbc8aed commit e13d0dc

2 files changed

Lines changed: 80 additions & 2 deletions

File tree

lib/HLSL/HLOperationLower.cpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4338,9 +4338,20 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
43384338
Type *EltTy = Ty->getScalarType();
43394339
const bool is64 = (EltTy->isIntegerTy(64) || EltTy->isDoubleTy());
43404340
const bool isBool = EltTy->isIntegerTy(1);
4341+
// Check for min precision types: their alloc size (from data layout padding
4342+
// like i16:32, f16:32) exceeds their primitive size. RawBufferVectorLoad
4343+
// should use the widened type (i32/f32) to match how pre-SM6.9
4344+
// RawBufferLoad handles min precision (load i32, then trunc to i16).
4345+
const bool isMinPrec = !isBool && DL.getTypeAllocSizeInBits(EltTy) >
4346+
EltTy->getPrimitiveSizeInBits();
4347+
Type *OrigEltTy = EltTy;
43414348
// Values will be loaded in memory representations.
4342-
if (isBool || (is64 && isTyped))
4343-
EltTy = Builder.getInt32Ty();
4349+
if (isBool || (is64 && isTyped) || isMinPrec) {
4350+
if (isMinPrec && EltTy->isFloatingPointTy())
4351+
EltTy = Builder.getFloatTy();
4352+
else
4353+
EltTy = Builder.getInt32Ty();
4354+
}
43444355

43454356
// Calculate load size with the scalar memory element type.
43464357
unsigned LdSize = DL.getTypeAllocSize(EltTy);
@@ -4454,6 +4465,16 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
44544465
retValNew = Builder.CreateICmpNE(
44554466
retValNew, Constant::getNullValue(retValNew->getType()));
44564467

4468+
// Truncate widened min precision loads back to original type.
4469+
// e.g., <3 x i32> from rawBufferVectorLoad.v3i32 -> <3 x i16>
4470+
if (isMinPrec) {
4471+
Type *TargetTy = Ty;
4472+
if (OrigEltTy->isIntegerTy())
4473+
retValNew = Builder.CreateTrunc(retValNew, TargetTy);
4474+
else
4475+
retValNew = Builder.CreateFPTrunc(retValNew, TargetTy);
4476+
}
4477+
44574478
helper.retVal->replaceAllUsesWith(retValNew);
44584479
helper.retVal = retValNew;
44594480

@@ -4574,6 +4595,27 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
45744595
val = Builder.CreateZExt(val, Ty);
45754596
}
45764597

4598+
// Widen min precision types to i32/f32 for RawBufferVectorStore, matching
4599+
// how pre-SM6.9 RawBufferStore handles min precision (store as i32).
4600+
if (opcode == OP::OpCode::RawBufferVectorStore) {
4601+
const DataLayout &DL =
4602+
OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
4603+
if (DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits()) {
4604+
Type *WideTy = EltTy->isFloatingPointTy() ? (Type *)Builder.getFloatTy()
4605+
: (Type *)i32Ty;
4606+
Type *WideVecTy =
4607+
Ty->isVectorTy()
4608+
? (Type *)VectorType::get(WideTy, Ty->getVectorNumElements())
4609+
: WideTy;
4610+
if (EltTy->isFloatingPointTy())
4611+
val = Builder.CreateFPExt(val, WideVecTy);
4612+
else
4613+
val = Builder.CreateSExt(val, WideVecTy);
4614+
EltTy = WideTy;
4615+
Ty = WideVecTy;
4616+
}
4617+
}
4618+
45774619
// If RawBuffer store of 64-bit value, don't set alignment to 8,
45784620
// since buffer alignment isn't known to be anything over 4.
45794621
unsigned alignValue = OP->GetAllocSizeForType(EltTy);
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// RUN: %dxc -E main -T cs_6_9 %s | FileCheck %s
2+
3+
// Regression test for min precision rawBufferVectorLoad/Store.
4+
// Min precision types should use i32/f32 vector operations (not i16/f16)
5+
// to match how pre-SM6.9 RawBufferLoad handles min precision.
6+
7+
RWByteAddressBuffer g_buf : register(u0);
8+
9+
[numthreads(1,1,1)]
10+
void main() {
11+
// min16int: should load as v3i32, not v3i16
12+
// CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
13+
min16int3 vi = g_buf.Load< min16int3 >(0);
14+
// CHECK: call void @dx.op.rawBufferVectorStore.v3i32
15+
g_buf.Store< min16int3 >(12, vi);
16+
17+
// min16uint: should load as v3i32, not v3i16
18+
// CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
19+
min16uint3 vu = g_buf.Load< min16uint3 >(24);
20+
// CHECK: call void @dx.op.rawBufferVectorStore.v3i32
21+
g_buf.Store< min16uint3 >(36, vu);
22+
23+
// min16float: should load as v3f32, not v3f16
24+
// CHECK: call %dx.types.ResRet.v3f32 @dx.op.rawBufferVectorLoad.v3f32
25+
// CHECK: fptrunc <3 x float> {{.*}} to <3 x half>
26+
min16float3 vf = g_buf.Load< min16float3 >(48);
27+
// CHECK: fpext <3 x half> {{.*}} to <3 x float>
28+
// CHECK: call void @dx.op.rawBufferVectorStore.v3f32
29+
g_buf.Store< min16float3 >(60, vf);
30+
31+
// Verify i16/f16 vector ops are NOT used.
32+
// CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}i16
33+
// CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}i16
34+
// CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}f16
35+
// CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}f16
36+
}

0 commit comments

Comments
 (0)