Fix rawBufferVectorLoad/Store to widen min precision types to 32-bit

alsepkow · Copilot · alsepkow · commit e13d0dc06c07 · 2026-03-17T19:26:51.000-07:00
RawBufferVectorLoad/Store for min precision types (min16int, min16uint,
min16float) emitted i16/f16 vector operations (e.g., v3i16), which
causes WARP, and potentially other drivers, to load/store 2 bytes per
element instead of 4. This mismatches the buffer layout when the CPU
writes 32-bit values.

Pre-SM6.9 RawBufferLoad correctly handles this by loading as i32/f32
and truncating. Apply the same pattern for SM6.9 vector variants:
- RawBufferVectorLoad: load as v_i32/v_f32, then trunc to i16/half
- RawBufferVectorStore: sext/fpext to i32/f32, then store as v_i32/v_f32

This matches the existing bool widening pattern already in TranslateBufLoad.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
@@ -4338,9 +4338,20 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
   Type *EltTy = Ty->getScalarType();
   const bool is64 = (EltTy->isIntegerTy(64) || EltTy->isDoubleTy());
   const bool isBool = EltTy->isIntegerTy(1);
+  // Check for min precision types: their alloc size (from data layout padding
+  // like i16:32, f16:32) exceeds their primitive size. RawBufferVectorLoad
+  // should use the widened type (i32/f32) to match how pre-SM6.9
+  // RawBufferLoad handles min precision (load i32, then trunc to i16).
+  const bool isMinPrec = !isBool && DL.getTypeAllocSizeInBits(EltTy) >
+                                        EltTy->getPrimitiveSizeInBits();
+  Type *OrigEltTy = EltTy;
   // Values will be loaded in memory representations.
-  if (isBool || (is64 && isTyped))
-    EltTy = Builder.getInt32Ty();
+  if (isBool || (is64 && isTyped) || isMinPrec) {
+    if (isMinPrec && EltTy->isFloatingPointTy())
+      EltTy = Builder.getFloatTy();
+    else
+      EltTy = Builder.getInt32Ty();
+  }
 
   // Calculate load size with the scalar memory element type.
   unsigned LdSize = DL.getTypeAllocSize(EltTy);
@@ -4454,6 +4465,16 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
     retValNew = Builder.CreateICmpNE(
         retValNew, Constant::getNullValue(retValNew->getType()));
 
+  // Truncate widened min precision loads back to original type.
+  // e.g., <3 x i32> from rawBufferVectorLoad.v3i32 -> <3 x i16>
+  if (isMinPrec) {
+    Type *TargetTy = Ty;
+    if (OrigEltTy->isIntegerTy())
+      retValNew = Builder.CreateTrunc(retValNew, TargetTy);
+    else
+      retValNew = Builder.CreateFPTrunc(retValNew, TargetTy);
+  }
+
   helper.retVal->replaceAllUsesWith(retValNew);
   helper.retVal = retValNew;
 
@@ -4574,6 +4595,27 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     val = Builder.CreateZExt(val, Ty);
   }
 
+  // Widen min precision types to i32/f32 for RawBufferVectorStore, matching
+  // how pre-SM6.9 RawBufferStore handles min precision (store as i32).
+  if (opcode == OP::OpCode::RawBufferVectorStore) {
+    const DataLayout &DL =
+        OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
+    if (DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits()) {
+      Type *WideTy = EltTy->isFloatingPointTy() ? (Type *)Builder.getFloatTy()
+                                                : (Type *)i32Ty;
+      Type *WideVecTy =
+          Ty->isVectorTy()
+              ? (Type *)VectorType::get(WideTy, Ty->getVectorNumElements())
+              : WideTy;
+      if (EltTy->isFloatingPointTy())
+        val = Builder.CreateFPExt(val, WideVecTy);
+      else
+        val = Builder.CreateSExt(val, WideVecTy);
+      EltTy = WideTy;
+      Ty = WideVecTy;
+    }
+  }
+
   // If RawBuffer store of 64-bit value, don't set alignment to 8,
   // since buffer alignment isn't known to be anything over 4.
   unsigned alignValue = OP->GetAllocSizeForType(EltTy);
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
@@ -0,0 +1,36 @@
+// RUN: %dxc -E main -T cs_6_9 %s | FileCheck %s
+
+// Regression test for min precision rawBufferVectorLoad/Store.
+// Min precision types should use i32/f32 vector operations (not i16/f16)
+// to match how pre-SM6.9 RawBufferLoad handles min precision.
+
+RWByteAddressBuffer g_buf : register(u0);
+
+[numthreads(1,1,1)]
+void main() {
+  // min16int: should load as v3i32, not v3i16
+  // CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
+  min16int3 vi = g_buf.Load< min16int3 >(0);
+  // CHECK: call void @dx.op.rawBufferVectorStore.v3i32
+  g_buf.Store< min16int3 >(12, vi);
+
+  // min16uint: should load as v3i32, not v3i16
+  // CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
+  min16uint3 vu = g_buf.Load< min16uint3 >(24);
+  // CHECK: call void @dx.op.rawBufferVectorStore.v3i32
+  g_buf.Store< min16uint3 >(36, vu);
+
+  // min16float: should load as v3f32, not v3f16
+  // CHECK: call %dx.types.ResRet.v3f32 @dx.op.rawBufferVectorLoad.v3f32
+  // CHECK: fptrunc <3 x float> {{.*}} to <3 x half>
+  min16float3 vf = g_buf.Load< min16float3 >(48);
+  // CHECK: fpext <3 x half> {{.*}} to <3 x float>
+  // CHECK: call void @dx.op.rawBufferVectorStore.v3f32
+  g_buf.Store< min16float3 >(60, vf);
+
+  // Verify i16/f16 vector ops are NOT used (checked from last match to EOF).
+  // CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}i16
+  // CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}i16
+  // CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}f16
+  // CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}f16
+}