From e13d0dc06c07200f8219deccd846a11d3d0b645d Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alsepkow@microsoft.com>
Date: Tue, 17 Mar 2026 19:24:04 -0700
Subject: [PATCH 01/13] Fix rawBufferVectorLoad/Store to widen min precision
 types to 32-bit

RawBufferVectorLoad/Store for min precision types (min16int, min16uint,
min16float) emitted i16/f16 vector operations (e.g., v3i16), which
caused WARP and potentially other drivers to load/store 2 bytes per
element instead of 4. This does not match the buffer layout when the
CPU writes 32-bit values.

Pre-SM6.9 RawBufferLoad correctly handles this by loading as i32/f32
and truncating. Apply the same pattern for SM6.9 vector variants:
- RawBufferVectorLoad: load as v_i32/v_f32, then trunc to i16/half
- RawBufferVectorStore: sext/fpext to i32/f32, then store as v_i32/v_f32

This matches the bool widening pattern already in TranslateBufLoad.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/HLOperationLower.cpp                 | 46 ++++++++++++++++++-
 .../min_precision_vector_load_store.hlsl      | 36 +++++++++++++++
 2 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 4f22a4598d..8b55de827c 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4338,9 +4338,20 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
   Type *EltTy = Ty->getScalarType();
   const bool is64 = (EltTy->isIntegerTy(64) || EltTy->isDoubleTy());
   const bool isBool = EltTy->isIntegerTy(1);
+  // Check for min precision types: their alloc size (from data layout padding
+  // like i16:32, f16:32) exceeds their primitive size. RawBufferVectorLoad
+  // should use the widened type (i32/f32) to match how pre-SM6.9
+  // RawBufferLoad handles min precision (load i32, then trunc to i16).
+  const bool isMinPrec = !isBool && DL.getTypeAllocSizeInBits(EltTy) >
+                                        EltTy->getPrimitiveSizeInBits();
+  Type *OrigEltTy = EltTy;
   // Values will be loaded in memory representations.
-  if (isBool || (is64 && isTyped))
-    EltTy = Builder.getInt32Ty();
+  if (isBool || (is64 && isTyped) || isMinPrec) {
+    if (isMinPrec && EltTy->isFloatingPointTy())
+      EltTy = Builder.getFloatTy();
+    else
+      EltTy = Builder.getInt32Ty();
+  }
 
   // Calculate load size with the scalar memory element type.
   unsigned LdSize = DL.getTypeAllocSize(EltTy);
@@ -4454,6 +4465,16 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
     retValNew = Builder.CreateICmpNE(
         retValNew, Constant::getNullValue(retValNew->getType()));
 
+  // Truncate widened min precision loads back to original type.
+  // e.g., <3 x i32> from rawBufferVectorLoad.v3i32 -> <3 x i16>
+  if (isMinPrec) {
+    Type *TargetTy = Ty;
+    if (OrigEltTy->isIntegerTy())
+      retValNew = Builder.CreateTrunc(retValNew, TargetTy);
+    else
+      retValNew = Builder.CreateFPTrunc(retValNew, TargetTy);
+  }
+
   helper.retVal->replaceAllUsesWith(retValNew);
   helper.retVal = retValNew;
 
@@ -4574,6 +4595,27 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     val = Builder.CreateZExt(val, Ty);
   }
 
+  // Widen min precision types to i32/f32 for RawBufferVectorStore, matching
+  // how pre-SM6.9 RawBufferStore handles min precision (store as i32).
+  if (opcode == OP::OpCode::RawBufferVectorStore) {
+    const DataLayout &DL =
+        OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
+    if (DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits()) {
+      Type *WideTy = EltTy->isFloatingPointTy() ? (Type *)Builder.getFloatTy()
+                                                : (Type *)i32Ty;
+      Type *WideVecTy =
+          Ty->isVectorTy()
+              ? (Type *)VectorType::get(WideTy, Ty->getVectorNumElements())
+              : WideTy;
+      if (EltTy->isFloatingPointTy())
+        val = Builder.CreateFPExt(val, WideVecTy);
+      else
+        val = Builder.CreateSExt(val, WideVecTy);
+      EltTy = WideTy;
+      Ty = WideVecTy;
+    }
+  }
+
   // If RawBuffer store of 64-bit value, don't set alignment to 8,
   // since buffer alignment isn't known to be anything over 4.
   unsigned alignValue = OP->GetAllocSizeForType(EltTy);
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
new file mode 100644
index 0000000000..6748357d43
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
@@ -0,0 +1,36 @@
+// RUN: %dxc -E main -T cs_6_9 %s | FileCheck %s
+
+// Regression test for min precision rawBufferVectorLoad/Store.
+// Min precision types should use i32/f32 vector operations (not i16/f16)
+// to match how pre-SM6.9 RawBufferLoad handles min precision.
+
+RWByteAddressBuffer g_buf : register(u0);
+
+[numthreads(1,1,1)]
+void main() {
+  // min16int: should load as v3i32, not v3i16
+  // CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
+  min16int3 vi = g_buf.Load< min16int3 >(0);
+  // CHECK: call void @dx.op.rawBufferVectorStore.v3i32
+  g_buf.Store< min16int3 >(12, vi);
+
+  // min16uint: should load as v3i32, not v3i16
+  // CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
+  min16uint3 vu = g_buf.Load< min16uint3 >(24);
+  // CHECK: call void @dx.op.rawBufferVectorStore.v3i32
+  g_buf.Store< min16uint3 >(36, vu);
+
+  // min16float: should load as v3f32, not v3f16
+  // CHECK: call %dx.types.ResRet.v3f32 @dx.op.rawBufferVectorLoad.v3f32
+  // CHECK: fptrunc <3 x float> {{.*}} to <3 x half>
+  min16float3 vf = g_buf.Load< min16float3 >(48);
+  // CHECK: fpext <3 x half> {{.*}} to <3 x float>
+  // CHECK: call void @dx.op.rawBufferVectorStore.v3f32
+  g_buf.Store< min16float3 >(60, vf);
+
+  // Verify i16/f16 vector ops are NOT used.
+  // CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}i16
+  // CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}i16
+  // CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}f16
+  // CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}f16
+}

From 9defa02fbf4d91021f4bb40e22bf16c4556a1e18 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alsepkow@microsoft.com>
Date: Wed, 25 Mar 2026 17:02:03 -0700
Subject: [PATCH 02/13] Fix min16uint debug test to accept named SSA values

The min precision widening changes introduce trunc instructions that
cause SSA values to get names like %.i08 instead of numeric %6. Widen
the FileCheck regex from %{{[0-9]+}} to %{{[^ ,]+}} to accept both.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../dxil/debug/min16/min16uint_vec.hlsl          | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16uint_vec.hlsl b/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16uint_vec.hlsl
index e09a944a44..488c0385f9 100644
--- a/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16uint_vec.hlsl
+++ b/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16uint_vec.hlsl
@@ -16,20 +16,20 @@ void main()
 {
     Foo foo = buf[0];
     // foo.m_B.x
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 96, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 48, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 96, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 48, 16)
 
     // foo.m_B.y
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 128, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 64, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 128, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 64, 16)
 
     // foo.m_B.z
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 160, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 80, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 160, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 80, 16)
 
     // foo.m_A.x
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
 
     min16int value1 = foo.m_B.x;
     min16int value2 = foo.m_B.y;

From b9f249a4b31e89933409c94cb661ade850bb48bd Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alsepkow@microsoft.com>
Date: Wed, 25 Mar 2026 17:04:13 -0700
Subject: [PATCH 03/13] Fix min16int_vec and min16float_vec debug tests for
 named SSA values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Same fix as min16uint_vec — widen regex from %{{[0-9]+}} to %{{[^ ,]+}}
to accept named SSA values introduced by min precision widening.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../dxil/debug/min16/min16float_vec.hlsl         | 16 ++++++++--------
 .../dxil/debug/min16/min16int_vec.hlsl           | 16 ++++++++--------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16float_vec.hlsl b/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16float_vec.hlsl
index 60fff4a6df..5dae57d261 100644
--- a/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16float_vec.hlsl
+++ b/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16float_vec.hlsl
@@ -16,20 +16,20 @@ void main()
 {
     Foo foo = buf[0];
     // foo.m_B.x
-    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 96, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 48, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 96, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 48, 16)
 
     // foo.m_B.y
-    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 128, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 64, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 128, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 64, 16)
 
     // foo.m_B.z
-    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 160, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 80, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 160, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 80, 16)
 
     // foo.m_A.x
-    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata half %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
 
     min16float value1 = foo.m_B.x;
     min16float value2 = foo.m_B.y;
diff --git a/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16int_vec.hlsl b/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16int_vec.hlsl
index a16a006b76..b7af2cf87d 100644
--- a/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16int_vec.hlsl
+++ b/tools/clang/test/HLSLFileCheck/dxil/debug/min16/min16int_vec.hlsl
@@ -16,20 +16,20 @@ void main()
 {
     Foo foo = buf[0];
     // foo.m_B.x
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 96, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 48, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 96, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 48, 16)
 
     // foo.m_B.y
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 128, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 64, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 128, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 64, 16)
 
     // foo.m_B.z
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 160, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 80, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 160, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 80, 16)
 
     // foo.m_A.x
-    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
-    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[0-9]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
+    // CHECK-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
+    // CHECK16-DAG: call void @llvm.dbg.value(metadata i16 %{{[^ ,]+}}, i64 0, metadata !{{[0-9]+}}, metadata !{{[0-9]+}}), !dbg !{{[0-9]+}} ; var:"foo" !DIExpression(DW_OP_bit_piece, 0, 16)
 
     min16int value1 = foo.m_B.x;
     min16int value2 = foo.m_B.y;

From 7c95fb087874ed1f377aef873692bf17462cd3bd Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alsepkow@microsoft.com>
Date: Wed, 25 Mar 2026 17:24:07 -0700
Subject: [PATCH 04/13] Address review: extract helpers, simplify code, shorten
 comments

- Extract isMinPrecisionType() and widenMinPrecisionType() helpers
- Remove redundant TargetTy alias; use Ty directly
- Shorten verbose comments
- Simplify store widening using helper function

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/HLOperationLower.cpp | 54 ++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 54ccb391b1..1a1df72024 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4322,6 +4322,26 @@ static SmallVector<Value *, 10> GetBufLoadArgs(ResLoadHelper helper,
   return Args;
 }
 
+// Returns true if EltTy is a min precision type whose padded alloc size
+// exceeds its primitive size (e.g., i16:32, f16:32 in the data layout).
+static bool isMinPrecisionType(Type *EltTy, const DataLayout &DL) {
+  return !EltTy->isIntegerTy(1) &&
+         DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits();
+}
+
+// Widens a min precision element type to its 32-bit equivalent (i32 or f32).
+// Returns the original type if not min precision.
+static Type *widenMinPrecisionType(Type *EltTy, Type *VecOrScalarTy,
+                                   IRBuilder<> &Builder, const DataLayout &DL) {
+  if (!isMinPrecisionType(EltTy, DL))
+    return VecOrScalarTy;
+  Type *WideTy = EltTy->isFloatingPointTy() ? (Type *)Builder.getFloatTy()
+                                            : (Type *)Builder.getInt32Ty();
+  if (VecOrScalarTy->isVectorTy())
+    return VectorType::get(WideTy, VecOrScalarTy->getVectorNumElements());
+  return WideTy;
+}
+
 // Emits as many calls as needed to load the full vector
 // Performs any needed extractions and conversions of the results.
 Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
@@ -4338,12 +4358,8 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
   Type *EltTy = Ty->getScalarType();
   const bool is64 = (EltTy->isIntegerTy(64) || EltTy->isDoubleTy());
   const bool isBool = EltTy->isIntegerTy(1);
-  // Check for min precision types: their alloc size (from data layout padding
-  // like i16:32, f16:32) exceeds their primitive size. RawBufferVectorLoad
-  // should use the widened type (i32/f32) to match how pre-SM6.9
-  // RawBufferLoad handles min precision (load i32, then trunc to i16).
-  const bool isMinPrec = !isBool && DL.getTypeAllocSizeInBits(EltTy) >
-                                        EltTy->getPrimitiveSizeInBits();
+  // Min precision alloc size exceeds prim size. Use the widened type.
+  const bool isMinPrec = isMinPrecisionType(EltTy, DL);
   Type *OrigEltTy = EltTy;
   // Values will be loaded in memory representations.
   if (isBool || (is64 && isTyped) || isMinPrec) {
@@ -4466,13 +4482,11 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
         retValNew, Constant::getNullValue(retValNew->getType()));
 
   // Truncate widened min precision loads back to original type.
-  // e.g., <3 x i32> from rawBufferVectorLoad.v3i32 -> <3 x i16>
   if (isMinPrec) {
-    Type *TargetTy = Ty;
     if (OrigEltTy->isIntegerTy())
-      retValNew = Builder.CreateTrunc(retValNew, TargetTy);
+      retValNew = Builder.CreateTrunc(retValNew, Ty);
     else
-      retValNew = Builder.CreateFPTrunc(retValNew, TargetTy);
+      retValNew = Builder.CreateFPTrunc(retValNew, Ty);
   }
 
   helper.retVal->replaceAllUsesWith(retValNew);
@@ -4595,24 +4609,18 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     val = Builder.CreateZExt(val, Ty);
   }
 
-  // Widen min precision types to i32/f32 for RawBufferVectorStore, matching
-  // how pre-SM6.9 RawBufferStore handles min precision (store as i32).
+  // Widen min precision types to i32/f32 for RawBufferVectorStore.
   if (opcode == OP::OpCode::RawBufferVectorStore) {
     const DataLayout &DL =
         OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
-    if (DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits()) {
-      Type *WideTy = EltTy->isFloatingPointTy() ? (Type *)Builder.getFloatTy()
-                                                : (Type *)i32Ty;
-      Type *WideVecTy =
-          Ty->isVectorTy()
-              ? (Type *)VectorType::get(WideTy, Ty->getVectorNumElements())
-              : WideTy;
+    Type *WideTy = widenMinPrecisionType(EltTy, Ty, Builder, DL);
+    if (WideTy != Ty) {
       if (EltTy->isFloatingPointTy())
-        val = Builder.CreateFPExt(val, WideVecTy);
+        val = Builder.CreateFPExt(val, WideTy);
       else
-        val = Builder.CreateSExt(val, WideVecTy);
-      EltTy = WideTy;
-      Ty = WideVecTy;
+        val = Builder.CreateSExt(val, WideTy);
+      EltTy = WideTy->getScalarType();
+      Ty = WideTy;
     }
   }
 

From 4302236af05b0000fa3300295d6fdbb08112d55f Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alexsepkowski@gmail.com>
Date: Wed, 25 Mar 2026 17:33:39 -0700
Subject: [PATCH 05/13] Apply suggestions from code review

Co-authored-by: Alex Sepkowski <alexsepkowski@gmail.com>
---
 lib/HLSL/HLOperationLower.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 1a1df72024..9bc2df21fb 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4322,8 +4322,6 @@ static SmallVector<Value *, 10> GetBufLoadArgs(ResLoadHelper helper,
   return Args;
 }
 
-// Returns true if EltTy is a min precision type whose padded alloc size
-// exceeds its primitive size (e.g., i16:32, f16:32 in the data layout).
 static bool isMinPrecisionType(Type *EltTy, const DataLayout &DL) {
   return !EltTy->isIntegerTy(1) &&
          DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits();

From 8e7c6fb4553818502de069a6f0e5c994764ff76b Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <alexsepkowski@gmail.com>
Date: Thu, 26 Mar 2026 10:03:04 -0700
Subject: [PATCH 06/13] Update lib/HLSL/HLOperationLower.cpp

Co-authored-by: Tex Riddell <texr@microsoft.com>
---
 lib/HLSL/HLOperationLower.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 9bc2df21fb..f083a58526 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4360,8 +4360,13 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
   const bool isMinPrec = isMinPrecisionType(EltTy, DL);
   Type *OrigEltTy = EltTy;
   // Values will be loaded in memory representations.
-  if (isBool || (is64 && isTyped) || isMinPrec) {
-    if (isMinPrec && EltTy->isFloatingPointTy())
+  // If bool (i1), load from memory-representation (i32),
+  // or if 64-bits and typed, load i32 chunks, then reconstruct values.
+  if (isBool || (is64 && isTyped)) {
+    EltTy = Builder.getInt32Ty();
+  } else if (isMinPrec) {
+    // If min-precision, load raw value as 32-bit type.
+    if (EltTy->isFloatingPointTy())
       EltTy = Builder.getFloatTy();
     else
       EltTy = Builder.getInt32Ty();

From 41f37f9cde53ee22b731d785f35f621c0d401038 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:22:33 -0700
Subject: [PATCH 07/13] Address review: refactor widenMinPrecisionType, extend
 to RawBufferStore

- widenMinPrecisionType now takes a single Type* (vector or scalar) and
  LLVMContext instead of separate EltTy/VecOrScalarTy and IRBuilder.
- Load path uses widenMinPrecisionType upfront, eliminating the
  else-if-isMinPrec branch per tex3d's suggestion.
- Store widening now covers both RawBufferStore and RawBufferVectorStore.
  Without this, scalar min-precision stores crash with a cast type mismatch.
- Added scalar RawBufferLoad/Store test cases for all 3 min-precision types.
- Added TODO(#8314) for SExt/ZExt signedness issue.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/HLOperationLower.cpp                 | 49 +++++++++----------
 .../min_precision_vector_load_store.hlsl      | 32 ++++++++++--
 2 files changed, 53 insertions(+), 28 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index f083a58526..0a69f0d626 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4327,16 +4327,18 @@ static bool isMinPrecisionType(Type *EltTy, const DataLayout &DL) {
          DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits();
 }
 
-// Widens a min precision element type to its 32-bit equivalent (i32 or f32).
-// Returns the original type if not min precision.
-static Type *widenMinPrecisionType(Type *EltTy, Type *VecOrScalarTy,
-                                   IRBuilder<> &Builder, const DataLayout &DL) {
+// Widens a min precision type to its 32-bit equivalent (i32 or f32).
+// Accepts vector or scalar types. Returns the original type if not min
+// precision.
+static Type *widenMinPrecisionType(Type *Ty, LLVMContext &Ctx,
+                                   const DataLayout &DL) {
+  Type *EltTy = Ty->getScalarType();
   if (!isMinPrecisionType(EltTy, DL))
-    return VecOrScalarTy;
-  Type *WideTy = EltTy->isFloatingPointTy() ? (Type *)Builder.getFloatTy()
-                                            : (Type *)Builder.getInt32Ty();
-  if (VecOrScalarTy->isVectorTy())
-    return VectorType::get(WideTy, VecOrScalarTy->getVectorNumElements());
+    return Ty;
+  Type *WideTy = EltTy->isFloatingPointTy() ? Type::getFloatTy(Ctx)
+                                            : Type::getInt32Ty(Ctx);
+  if (Ty->isVectorTy())
+    return VectorType::get(WideTy, Ty->getVectorNumElements());
   return WideTy;
 }
 
@@ -4353,24 +4355,16 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
     NumComponents = Ty->getVectorNumElements();
 
   const bool isTyped = DXIL::IsTyped(RK);
-  Type *EltTy = Ty->getScalarType();
+  Type *OrigEltTy = Ty->getScalarType();
+  Type *WidenedTy = widenMinPrecisionType(Ty, Builder.getContext(), DL);
+  Type *EltTy = WidenedTy->getScalarType();
+  const bool isMinPrec = (WidenedTy != Ty);
   const bool is64 = (EltTy->isIntegerTy(64) || EltTy->isDoubleTy());
   const bool isBool = EltTy->isIntegerTy(1);
-  // Min precision alloc size exceeds prim size. Use the widened type.
-  const bool isMinPrec = isMinPrecisionType(EltTy, DL);
-  Type *OrigEltTy = EltTy;
-  // Values will be loaded in memory representations.
   // If bool (i1), load from memory-representation (i32),
   // or if 64-bits and typed, load i32 chunks, then reconstruct values.
-  if (isBool || (is64 && isTyped)) {
+  if (isBool || (is64 && isTyped))
     EltTy = Builder.getInt32Ty();
-  } else if (isMinPrec) {
-    // If min-precision, load raw value as 32-bit type.
-    if (EltTy->isFloatingPointTy())
-      EltTy = Builder.getFloatTy();
-    else
-      EltTy = Builder.getInt32Ty();
-  }
 
   // Calculate load size with the scalar memory element type.
   unsigned LdSize = DL.getTypeAllocSize(EltTy);
@@ -4612,15 +4606,20 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     val = Builder.CreateZExt(val, Ty);
   }
 
-  // Widen min precision types to i32/f32 for RawBufferVectorStore.
-  if (opcode == OP::OpCode::RawBufferVectorStore) {
+  // Widen min precision types to i32/f32 for raw buffer stores.
+  // Min precision types have 32-bit alloc size, so the address math and
+  // store intrinsic must use 32-bit values to match.
+  if (opcode == OP::OpCode::RawBufferStore ||
+      opcode == OP::OpCode::RawBufferVectorStore) {
     const DataLayout &DL =
         OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
-    Type *WideTy = widenMinPrecisionType(EltTy, Ty, Builder, DL);
+    Type *WideTy = widenMinPrecisionType(Ty, Builder.getContext(), DL);
     if (WideTy != Ty) {
       if (EltTy->isFloatingPointTy())
         val = Builder.CreateFPExt(val, WideTy);
       else
+        // TODO(#8314): Signedness info is lost by this point; SExt is wrong
+        // for min16uint. Front-end should widen during Clang CodeGen instead.
         val = Builder.CreateSExt(val, WideTy);
       EltTy = WideTy->getScalarType();
       Ty = WideTy;
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
index 6748357d43..38485f7de1 100644
--- a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
+++ b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
@@ -1,13 +1,15 @@
 // RUN: %dxc -E main -T cs_6_9 %s | FileCheck %s
 
-// Regression test for min precision rawBufferVectorLoad/Store.
-// Min precision types should use i32/f32 vector operations (not i16/f16)
+// Regression test for min precision rawBufferLoad/Store.
+// Min precision types should use i32/f32 operations (not i16/f16)
 // to match how pre-SM6.9 RawBufferLoad handles min precision.
 
 RWByteAddressBuffer g_buf : register(u0);
 
 [numthreads(1,1,1)]
 void main() {
+  // === Vector loads/stores (RawBufferVectorLoad/Store) ===
+
   // min16int: should load as v3i32, not v3i16
   // CHECK: call %dx.types.ResRet.v3i32 @dx.op.rawBufferVectorLoad.v3i32
   min16int3 vi = g_buf.Load< min16int3 >(0);
@@ -28,9 +30,33 @@ void main() {
   // CHECK: call void @dx.op.rawBufferVectorStore.v3f32
   g_buf.Store< min16float3 >(60, vf);
 
-  // Verify i16/f16 vector ops are NOT used.
+  // === Scalar loads/stores (RawBufferLoad/Store) ===
+
+  // min16int scalar: should use i32 rawBufferLoad/Store
+  // CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32
+  min16int si = g_buf.Load< min16int >(72);
+  // CHECK: call void @dx.op.rawBufferStore.i32
+  g_buf.Store< min16int >(76, si);
+
+  // min16uint scalar: should use i32 rawBufferLoad/Store
+  // CHECK: call %dx.types.ResRet.i32 @dx.op.rawBufferLoad.i32
+  min16uint su = g_buf.Load< min16uint >(80);
+  // CHECK: call void @dx.op.rawBufferStore.i32
+  g_buf.Store< min16uint >(84, su);
+
+  // min16float scalar: should use f32 rawBufferLoad/Store
+  // CHECK: call %dx.types.ResRet.f32 @dx.op.rawBufferLoad.f32
+  min16float sf = g_buf.Load< min16float >(88);
+  // CHECK: call void @dx.op.rawBufferStore.f32
+  g_buf.Store< min16float >(92, sf);
+
+  // Verify i16/f16 ops are NOT used.
   // CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}i16
   // CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}i16
   // CHECK-NOT: rawBufferVectorLoad.v{{[0-9]+}}f16
   // CHECK-NOT: rawBufferVectorStore.v{{[0-9]+}}f16
+  // CHECK-NOT: rawBufferLoad.i16
+  // CHECK-NOT: rawBufferStore.i16
+  // CHECK-NOT: rawBufferLoad.f16
+  // CHECK-NOT: rawBufferStore.f16
 }

From df53605e6723160c73b4ca9bfb100cb8e1e05ee4 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Mon, 30 Mar 2026 16:52:07 -0700
Subject: [PATCH 08/13] Remove redundant comment on widenMinPrecisionType

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/HLOperationLower.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 0a69f0d626..198e3f07b9 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4327,9 +4327,6 @@ static bool isMinPrecisionType(Type *EltTy, const DataLayout &DL) {
          DL.getTypeAllocSizeInBits(EltTy) > EltTy->getPrimitiveSizeInBits();
 }
 
-// Widens a min precision type to its 32-bit equivalent (i32 or f32).
-// Accepts vector or scalar types. Returns the original type if not min
-// precision.
 static Type *widenMinPrecisionType(Type *Ty, LLVMContext &Ctx,
                                    const DataLayout &DL) {
   Type *EltTy = Ty->getScalarType();

From d6b3294c40c2a83bbc507fc9aa102d462f3142aa Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Mon, 30 Mar 2026 17:48:52 -0700
Subject: [PATCH 09/13] Trim verbose comments to explain why, not what

Shorten 3 comments in HLOperationLower.cpp that re-described code
behavior. Each now explains the underlying reason in 1 line.
The TODO(#8314) comment was already concise and left unchanged.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/HLOperationLower.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 198e3f07b9..82de4db25b 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4358,8 +4358,7 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
   const bool isMinPrec = (WidenedTy != Ty);
   const bool is64 = (EltTy->isIntegerTy(64) || EltTy->isDoubleTy());
   const bool isBool = EltTy->isIntegerTy(1);
-  // If bool (i1), load from memory-representation (i32),
-  // or if 64-bits and typed, load i32 chunks, then reconstruct values.
+  // Bool and typed 64-bit loads go through i32 and are rebuilt after the load.
   if (isBool || (is64 && isTyped))
     EltTy = Builder.getInt32Ty();
 
@@ -4475,7 +4474,7 @@ Value *TranslateBufLoad(ResLoadHelper &helper, HLResource::Kind RK,
     retValNew = Builder.CreateICmpNE(
         retValNew, Constant::getNullValue(retValNew->getType()));
 
-  // Truncate widened min precision loads back to original type.
+  // DXIL loads min precision as 32-bit; narrow back to original IR type.
   if (isMinPrec) {
     if (OrigEltTy->isIntegerTy())
       retValNew = Builder.CreateTrunc(retValNew, Ty);
@@ -4603,9 +4602,7 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
     val = Builder.CreateZExt(val, Ty);
   }
 
-  // Widen min precision types to i32/f32 for raw buffer stores.
-  // Min precision types have 32-bit alloc size, so the address math and
-  // store intrinsic must use 32-bit values to match.
+  // Min precision alloc size is 32-bit; widen to match store intrinsic.
   if (opcode == OP::OpCode::RawBufferStore ||
       opcode == OP::OpCode::RawBufferVectorStore) {
     const DataLayout &DL =

From 14396aebc71acf05bf8a1a008645f68064da62a4 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Mon, 30 Mar 2026 18:17:12 -0700
Subject: [PATCH 10/13] Rename test to min_precision_raw_load_store.hlsl

The test covers both vector and scalar paths, so drop 'vector' from
the file name to accurately reflect scope.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 external/SPIRV-Headers                        |   2 +-
 external/SPIRV-Tools                          |   2 +-
 ...hlsl => min_precision_raw_load_store.hlsl} |   0
 utils/hct/setup_agility_sdk.py                | 245 ++++++++++++++++++
 4 files changed, 247 insertions(+), 2 deletions(-)
 rename tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/{min_precision_vector_load_store.hlsl => min_precision_raw_load_store.hlsl} (100%)
 create mode 100644 utils/hct/setup_agility_sdk.py

diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index 465055f6c9..04f10f650d 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit 465055f6c9128772e20082e893d974146acf7a02
+Subproject commit 04f10f650d514df88b76d25e83db360142c7b174
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index 8a13595dd4..fbe4f3ad91 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit 8a13595dd4ae5049ef42d0f30297d0c427db54b5
+Subproject commit fbe4f3ad913c44fe8700545f8ffe35d1382b7093
diff --git a/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl b/tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_raw_load_store.hlsl
similarity index 100%
rename from tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_vector_load_store.hlsl
rename to tools/clang/test/HLSLFileCheck/hlsl/objects/ByteAddressBuffer/min_precision_raw_load_store.hlsl
diff --git a/utils/hct/setup_agility_sdk.py b/utils/hct/setup_agility_sdk.py
new file mode 100644
index 0000000000..1c4ab233c2
--- /dev/null
+++ b/utils/hct/setup_agility_sdk.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+"""Setup Agility SDK binaries for DXC execution tests.
+
+Automates downloading and installing the latest D3D12 Agility SDK binaries
+into the TAEF directory of an hlsl.bin build tree, so exec tests can run
+with the latest D3D12 runtime.
+
+Usage:
+    python setup_agility_sdk.py [hlsl_bin_dir] [options]
+
+Examples:
+    python setup_agility_sdk.py F:\\hlsl.bin
+    python setup_agility_sdk.py F:\\hlsl.bin --sdk-type preview
+    python setup_agility_sdk.py --overwrite
+"""
+
+import argparse
+import glob
+import logging
+import os
+import shutil
+import sys
+import tempfile
+import zipfile
+
+DEFAULT_HLSL_BIN = r"F:\hlsl.bin"
+NETWORK_SHARE = r"\\GRFXSHARE\Sigma-GRFX\Users\amarp\IHVDrops"
+ZIP_PATTERN = "D3D12_AgilitySDK_preview_*"
+AGILITY_DLLS = ["D3D12Core.dll", "D3D12SDKLayers.dll"]
+
+log = logging.getLogger("setup_agility_sdk")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Setup Agility SDK binaries for DXC execution tests.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=(
+            "examples:\n"
+            "  %(prog)s F:\\hlsl.bin\n"
+            "  %(prog)s F:\\hlsl.bin --sdk-type preview\n"
+            "  %(prog)s --overwrite\n"
+        ),
+    )
+    parser.add_argument(
+        "hlsl_bin_dir",
+        nargs="?",
+        default=None,
+        help="Path to the hlsl.bin build directory (default: %(default)s).",
+    )
+    parser.add_argument(
+        "--sdk-type",
+        choices=["experimental", "preview"],
+        default="experimental",
+        help="Agility SDK flavor to install (default: experimental).",
+    )
+    parser.add_argument(
+        "--arch",
+        choices=["x64", "x86", "ARM64"],
+        default="x64",
+        help="Target architecture (default: x64).",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Overwrite existing Agility SDK binaries even if already present.",
+    )
+    return parser.parse_args()
+
+
+def validate_hlsl_bin(hlsl_bin_dir):
+    """Validate that the hlsl.bin directory exists."""
+    if not os.path.isdir(hlsl_bin_dir):
+        log.error("hlsl.bin directory does not exist: %s", hlsl_bin_dir)
+        sys.exit(1)
+    log.info("Using hlsl.bin directory: %s", hlsl_bin_dir)
+
+
+def validate_taef_dir(hlsl_bin_dir, arch):
+    """Validate that the TAEF/<arch> directory exists under hlsl.bin."""
+    taef_dir = os.path.join(hlsl_bin_dir, "TAEF", arch)
+    if not os.path.isdir(taef_dir):
+        log.error(
+            "TAEF directory not found: %s\n"
+            "  Make sure you have run hctstart.cmd to initialize the build environment.",
+            taef_dir,
+        )
+        sys.exit(1)
+    log.info("Found TAEF directory: %s", taef_dir)
+    return taef_dir
+
+
+def check_existing_sdk(taef_dir):
+    """Check if D3D12 Agility SDK DLLs already exist. Returns the D3D12 dir path."""
+    d3d12_dir = os.path.join(taef_dir, "D3D12")
+    if not os.path.isdir(d3d12_dir):
+        log.info("D3D12 directory does not exist yet: %s", d3d12_dir)
+        return d3d12_dir, False
+
+    missing = [f for f in AGILITY_DLLS if not os.path.isfile(os.path.join(d3d12_dir, f))]
+    if missing:
+        log.info("Agility SDK incomplete, missing: %s", ", ".join(missing))
+        return d3d12_dir, False
+
+    log.info("Agility SDK binaries already present in: %s", d3d12_dir)
+    return d3d12_dir, True
+
+
+def access_network_share():
+    """Verify network share is accessible."""
+    log.info("Checking network share access: %s", NETWORK_SHARE)
+    if not os.path.isdir(NETWORK_SHARE):
+        log.error(
+            "Cannot access network share: %s\n"
+            "  Must have corpnet or VPN access.",
+            NETWORK_SHARE,
+        )
+        sys.exit(1)
+    log.info("Network share is accessible.")
+
+
+def find_newest_zip():
+    """Find the newest Agility SDK zip on the network share."""
+    pattern = os.path.join(NETWORK_SHARE, ZIP_PATTERN + ".zip")
+    zips = sorted(glob.glob(pattern))
+    if not zips:
+        log.error("No Agility SDK zips found matching: %s", pattern)
+        sys.exit(1)
+    newest = zips[-1]
+    log.info("Found %d SDK zip(s). Using newest: %s", len(zips), os.path.basename(newest))
+    return newest
+
+
+def extract_and_copy(zip_path, sdk_type, arch, d3d12_dir):
+    """Extract the SDK zip to a temp directory and copy binaries."""
+    tmp_dir = tempfile.mkdtemp(prefix="agility_sdk_")
+    try:
+        log.info("Extracting %s to temp directory...", os.path.basename(zip_path))
+        with zipfile.ZipFile(zip_path, "r") as zf:
+            zf.extractall(tmp_dir)
+
+        # The zip extracts into a top-level directory named like the zip (without .zip).
+        # Find the actual extraction root.
+        top_dirs = [
+            d for d in os.listdir(tmp_dir) if os.path.isdir(os.path.join(tmp_dir, d))
+        ]
+
+        # Source path: <extract_root>/<sdk_type>/<arch>/sdkbin/
+        # Try with and without a top-level wrapper directory.
+        candidates = [os.path.join(tmp_dir, sdk_type, arch, "sdkbin")]
+        for td in top_dirs:
+            candidates.insert(0, os.path.join(tmp_dir, td, sdk_type, arch, "sdkbin"))
+
+        src_dir = None
+        for c in candidates:
+            if os.path.isdir(c):
+                src_dir = c
+                break
+
+        if src_dir is None:
+            log.error(
+                "Could not find SDK binaries in extracted zip.\n"
+                "  Expected path: <zip_root>/%s/%s/sdkbin/\n"
+                "  Searched:\n    %s",
+                sdk_type,
+                arch,
+                "\n    ".join(candidates),
+            )
+            sys.exit(1)
+
+        log.info("Found SDK binaries at: %s", src_dir)
+
+        # Create destination D3D12 directory if needed.
+        os.makedirs(d3d12_dir, exist_ok=True)
+
+        # Copy only the Agility SDK DLLs and their PDBs.
+        target_stems = {os.path.splitext(f)[0].lower() for f in AGILITY_DLLS}
+        copied = []
+        for fname in os.listdir(src_dir):
+            stem = os.path.splitext(fname)[0].lower()
+            ext = os.path.splitext(fname)[1].lower()
+            if stem in target_stems and ext in (".dll", ".pdb"):
+                src = os.path.join(src_dir, fname)
+                dst = os.path.join(d3d12_dir, fname)
+                shutil.copy2(src, dst)
+                copied.append(fname)
+                log.info("  Copied: %s", fname)
+
+        if not copied:
+            log.warning("No DLL/PDB files found in %s", src_dir)
+        else:
+            log.info("Copied %d file(s) to %s", len(copied), d3d12_dir)
+
+    finally:
+        log.info("Cleaning up temp directory: %s", tmp_dir)
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+def main():
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(levelname)-7s %(message)s",
+    )
+
+    args = parse_args()
+
+    # Resolve hlsl.bin directory.
+    if args.hlsl_bin_dir is None:
+        log.warning(
+            "No hlsl.bin path provided, defaulting to %s", DEFAULT_HLSL_BIN
+        )
+        hlsl_bin_dir = DEFAULT_HLSL_BIN
+    else:
+        hlsl_bin_dir = args.hlsl_bin_dir
+
+    hlsl_bin_dir = os.path.abspath(hlsl_bin_dir)
+
+    # Step 1: Validate hlsl.bin.
+    validate_hlsl_bin(hlsl_bin_dir)
+
+    # Step 2: Validate TAEF directory.
+    taef_dir = validate_taef_dir(hlsl_bin_dir, args.arch)
+
+    # Step 3: Check for existing Agility SDK.
+    d3d12_dir, already_present = check_existing_sdk(taef_dir)
+    if already_present and not args.overwrite:
+        log.info("Nothing to do. Use --overwrite to force update.")
+        return
+    if already_present and args.overwrite:
+        log.info("--overwrite specified, will replace existing binaries.")
+
+    # Step 4: Access network share.
+    access_network_share()
+
+    # Step 5: Find newest SDK zip.
+    zip_path = find_newest_zip()
+
+    # Step 6-7: Extract and copy.
+    extract_and_copy(zip_path, args.sdk_type, args.arch, d3d12_dir)
+
+    log.info("Agility SDK setup complete.")
+
+
+if __name__ == "__main__":
+    main()

From 81a1f81e4551b00905beb5a2bb08d61e732d5bd8 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Mon, 30 Mar 2026 18:21:16 -0700
Subject: [PATCH 11/13] Revert accidental submodule and unrelated file changes

Restores SPIRV-Headers and SPIRV-Tools submodule pointers to their
previous state and removes setup_agility_sdk.py which was staged
unintentionally.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 external/SPIRV-Headers         |   2 +-
 external/SPIRV-Tools           |   2 +-
 utils/hct/setup_agility_sdk.py | 245 ---------------------------------
 3 files changed, 2 insertions(+), 247 deletions(-)
 delete mode 100644 utils/hct/setup_agility_sdk.py

diff --git a/external/SPIRV-Headers b/external/SPIRV-Headers
index 04f10f650d..465055f6c9 160000
--- a/external/SPIRV-Headers
+++ b/external/SPIRV-Headers
@@ -1 +1 @@
-Subproject commit 04f10f650d514df88b76d25e83db360142c7b174
+Subproject commit 465055f6c9128772e20082e893d974146acf7a02
diff --git a/external/SPIRV-Tools b/external/SPIRV-Tools
index fbe4f3ad91..8a13595dd4 160000
--- a/external/SPIRV-Tools
+++ b/external/SPIRV-Tools
@@ -1 +1 @@
-Subproject commit fbe4f3ad913c44fe8700545f8ffe35d1382b7093
+Subproject commit 8a13595dd4ae5049ef42d0f30297d0c427db54b5
diff --git a/utils/hct/setup_agility_sdk.py b/utils/hct/setup_agility_sdk.py
deleted file mode 100644
index 1c4ab233c2..0000000000
--- a/utils/hct/setup_agility_sdk.py
+++ /dev/null
@@ -1,245 +0,0 @@
-#!/usr/bin/env python3
-"""Setup Agility SDK binaries for DXC execution tests.
-
-Automates downloading and installing the latest D3D12 Agility SDK binaries
-into the TAEF directory of an hlsl.bin build tree, so exec tests can run
-with the latest D3D12 runtime.
-
-Usage:
-    python setup_agility_sdk.py [hlsl_bin_dir] [options]
-
-Examples:
-    python setup_agility_sdk.py F:\\hlsl.bin
-    python setup_agility_sdk.py F:\\hlsl.bin --sdk-type preview
-    python setup_agility_sdk.py --overwrite
-"""
-
-import argparse
-import glob
-import logging
-import os
-import shutil
-import sys
-import tempfile
-import zipfile
-
-DEFAULT_HLSL_BIN = r"F:\hlsl.bin"
-NETWORK_SHARE = r"\\GRFXSHARE\Sigma-GRFX\Users\amarp\IHVDrops"
-ZIP_PATTERN = "D3D12_AgilitySDK_preview_*"
-AGILITY_DLLS = ["D3D12Core.dll", "D3D12SDKLayers.dll"]
-
-log = logging.getLogger("setup_agility_sdk")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Setup Agility SDK binaries for DXC execution tests.",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog=(
-            "examples:\n"
-            "  %(prog)s F:\\hlsl.bin\n"
-            "  %(prog)s F:\\hlsl.bin --sdk-type preview\n"
-            "  %(prog)s --overwrite\n"
-        ),
-    )
-    parser.add_argument(
-        "hlsl_bin_dir",
-        nargs="?",
-        default=None,
-        help="Path to the hlsl.bin build directory (default: %(default)s).",
-    )
-    parser.add_argument(
-        "--sdk-type",
-        choices=["experimental", "preview"],
-        default="experimental",
-        help="Agility SDK flavor to install (default: experimental).",
-    )
-    parser.add_argument(
-        "--arch",
-        choices=["x64", "x86", "ARM64"],
-        default="x64",
-        help="Target architecture (default: x64).",
-    )
-    parser.add_argument(
-        "--overwrite",
-        action="store_true",
-        help="Overwrite existing Agility SDK binaries even if already present.",
-    )
-    return parser.parse_args()
-
-
-def validate_hlsl_bin(hlsl_bin_dir):
-    """Validate that the hlsl.bin directory exists."""
-    if not os.path.isdir(hlsl_bin_dir):
-        log.error("hlsl.bin directory does not exist: %s", hlsl_bin_dir)
-        sys.exit(1)
-    log.info("Using hlsl.bin directory: %s", hlsl_bin_dir)
-
-
-def validate_taef_dir(hlsl_bin_dir, arch):
-    """Validate that the TAEF/<arch> directory exists under hlsl.bin."""
-    taef_dir = os.path.join(hlsl_bin_dir, "TAEF", arch)
-    if not os.path.isdir(taef_dir):
-        log.error(
-            "TAEF directory not found: %s\n"
-            "  Make sure you have run hctstart.cmd to initialize the build environment.",
-            taef_dir,
-        )
-        sys.exit(1)
-    log.info("Found TAEF directory: %s", taef_dir)
-    return taef_dir
-
-
-def check_existing_sdk(taef_dir):
-    """Check if D3D12 Agility SDK DLLs already exist. Returns the D3D12 dir path."""
-    d3d12_dir = os.path.join(taef_dir, "D3D12")
-    if not os.path.isdir(d3d12_dir):
-        log.info("D3D12 directory does not exist yet: %s", d3d12_dir)
-        return d3d12_dir, False
-
-    missing = [f for f in AGILITY_DLLS if not os.path.isfile(os.path.join(d3d12_dir, f))]
-    if missing:
-        log.info("Agility SDK incomplete, missing: %s", ", ".join(missing))
-        return d3d12_dir, False
-
-    log.info("Agility SDK binaries already present in: %s", d3d12_dir)
-    return d3d12_dir, True
-
-
-def access_network_share():
-    """Verify network share is accessible."""
-    log.info("Checking network share access: %s", NETWORK_SHARE)
-    if not os.path.isdir(NETWORK_SHARE):
-        log.error(
-            "Cannot access network share: %s\n"
-            "  Must have corpnet or VPN access.",
-            NETWORK_SHARE,
-        )
-        sys.exit(1)
-    log.info("Network share is accessible.")
-
-
-def find_newest_zip():
-    """Find the newest Agility SDK zip on the network share."""
-    pattern = os.path.join(NETWORK_SHARE, ZIP_PATTERN + ".zip")
-    zips = sorted(glob.glob(pattern))
-    if not zips:
-        log.error("No Agility SDK zips found matching: %s", pattern)
-        sys.exit(1)
-    newest = zips[-1]
-    log.info("Found %d SDK zip(s). Using newest: %s", len(zips), os.path.basename(newest))
-    return newest
-
-
-def extract_and_copy(zip_path, sdk_type, arch, d3d12_dir):
-    """Extract the SDK zip to a temp directory and copy binaries."""
-    tmp_dir = tempfile.mkdtemp(prefix="agility_sdk_")
-    try:
-        log.info("Extracting %s to temp directory...", os.path.basename(zip_path))
-        with zipfile.ZipFile(zip_path, "r") as zf:
-            zf.extractall(tmp_dir)
-
-        # The zip extracts into a top-level directory named like the zip (without .zip).
-        # Find the actual extraction root.
-        top_dirs = [
-            d for d in os.listdir(tmp_dir) if os.path.isdir(os.path.join(tmp_dir, d))
-        ]
-
-        # Source path: <extract_root>/<sdk_type>/<arch>/sdkbin/
-        # Try with and without a top-level wrapper directory.
-        candidates = [os.path.join(tmp_dir, sdk_type, arch, "sdkbin")]
-        for td in top_dirs:
-            candidates.insert(0, os.path.join(tmp_dir, td, sdk_type, arch, "sdkbin"))
-
-        src_dir = None
-        for c in candidates:
-            if os.path.isdir(c):
-                src_dir = c
-                break
-
-        if src_dir is None:
-            log.error(
-                "Could not find SDK binaries in extracted zip.\n"
-                "  Expected path: <zip_root>/%s/%s/sdkbin/\n"
-                "  Searched:\n    %s",
-                sdk_type,
-                arch,
-                "\n    ".join(candidates),
-            )
-            sys.exit(1)
-
-        log.info("Found SDK binaries at: %s", src_dir)
-
-        # Create destination D3D12 directory if needed.
-        os.makedirs(d3d12_dir, exist_ok=True)
-
-        # Copy only the Agility SDK DLLs and their PDBs.
-        target_stems = {os.path.splitext(f)[0].lower() for f in AGILITY_DLLS}
-        copied = []
-        for fname in os.listdir(src_dir):
-            stem = os.path.splitext(fname)[0].lower()
-            ext = os.path.splitext(fname)[1].lower()
-            if stem in target_stems and ext in (".dll", ".pdb"):
-                src = os.path.join(src_dir, fname)
-                dst = os.path.join(d3d12_dir, fname)
-                shutil.copy2(src, dst)
-                copied.append(fname)
-                log.info("  Copied: %s", fname)
-
-        if not copied:
-            log.warning("No DLL/PDB files found in %s", src_dir)
-        else:
-            log.info("Copied %d file(s) to %s", len(copied), d3d12_dir)
-
-    finally:
-        log.info("Cleaning up temp directory: %s", tmp_dir)
-        shutil.rmtree(tmp_dir, ignore_errors=True)
-
-
-def main():
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(levelname)-7s %(message)s",
-    )
-
-    args = parse_args()
-
-    # Resolve hlsl.bin directory.
-    if args.hlsl_bin_dir is None:
-        log.warning(
-            "No hlsl.bin path provided, defaulting to %s", DEFAULT_HLSL_BIN
-        )
-        hlsl_bin_dir = DEFAULT_HLSL_BIN
-    else:
-        hlsl_bin_dir = args.hlsl_bin_dir
-
-    hlsl_bin_dir = os.path.abspath(hlsl_bin_dir)
-
-    # Step 1: Validate hlsl.bin.
-    validate_hlsl_bin(hlsl_bin_dir)
-
-    # Step 2: Validate TAEF directory.
-    taef_dir = validate_taef_dir(hlsl_bin_dir, args.arch)
-
-    # Step 3: Check for existing Agility SDK.
-    d3d12_dir, already_present = check_existing_sdk(taef_dir)
-    if already_present and not args.overwrite:
-        log.info("Nothing to do. Use --overwrite to force update.")
-        return
-    if already_present and args.overwrite:
-        log.info("--overwrite specified, will replace existing binaries.")
-
-    # Step 4: Access network share.
-    access_network_share()
-
-    # Step 5: Find newest SDK zip.
-    zip_path = find_newest_zip()
-
-    # Step 6-7: Extract and copy.
-    extract_and_copy(zip_path, args.sdk_type, args.arch, d3d12_dir)
-
-    log.info("Agility SDK setup complete.")
-
-
-if __name__ == "__main__":
-    main()

From d7a076087f4d1af57abca6815be1b9fda56ab998 Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Tue, 31 Mar 2026 10:25:26 -0700
Subject: [PATCH 12/13] Fix min precision store widening: scope to
 ByteAddressBuffer only

The store widening for min precision types was applying to both
RawBufferStore (scalar) and RawBufferVectorStore (vector). This
broke StructuredBuffer stores (struct_buf3.hlsl) because it replaced
the correct sext/zext from TranslateMinPrecisionRawBuffer with a
blanket sext, losing signedness info for min16uint.

Scope the RawBufferStore widening to RawBuffer (ByteAddressBuffer)
only. StructuredBuffer scalar stores are correctly handled by the
later TranslateMinPrecisionRawBuffer pass in DxilGenerationPass,
which has signedness info from struct type annotations.

ByteAddressBuffer scalar stores still need widening here because
the later pass crashes on non-struct resource types (cast<StructType>
on ByteAddressBuffer's i32 inner element).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/HLOperationLower.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 82de4db25b..0e1b550626 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4603,8 +4603,14 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   }
 
   // Min precision alloc size is 32-bit; widen to match store intrinsic.
-  if (opcode == OP::OpCode::RawBufferStore ||
-      opcode == OP::OpCode::RawBufferVectorStore) {
+  // StructuredBuffer scalar stores are handled by
+  // TranslateMinPrecisionRawBuffer in DxilGenerationPass, which has signedness
+  // info from struct annotations. ByteAddressBuffer (RawBuffer) scalar stores
+  // must be widened here because that later pass crashes on non-struct resource
+  // types (cast<StructType> fail).
+  if (opcode == OP::OpCode::RawBufferVectorStore ||
+      (opcode == OP::OpCode::RawBufferStore &&
+       RK == DxilResource::Kind::RawBuffer)) {
     const DataLayout &DL =
         OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
     Type *WideTy = widenMinPrecisionType(Ty, Builder.getContext(), DL);

From 52a71937add742dea8764128cd3c36980f09399f Mon Sep 17 00:00:00 2001
From: Alex Sepkowski <5620315+alsepkow@users.noreply.github.com>
Date: Tue, 31 Mar 2026 10:38:03 -0700
Subject: [PATCH 13/13] Fix min precision store widening for ByteAddressBuffer
 scalars

Move scalar RawBufferStore widening back to TranslateMinPrecisionRawBuffer
in DxilGenerationPass where it belongs, keeping HLOperationLower scoped to
RawBufferVectorStore only.

Fix crash in ReplaceMinPrecisionRawBufferStoreByType for ByteAddressBuffer:
the existing code did cast<StructType> on the inner resource element, which
fails for ByteAddressBuffer (inner element is i32, not a struct). Use
dyn_cast instead and fall back to sext when no struct annotation is
available. Preserve undef args to avoid store mask validation errors.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 lib/HLSL/DxilGenerationPass.cpp | 75 ++++++++++++++++++++-------------
 lib/HLSL/HLOperationLower.cpp   | 11 ++---
 2 files changed, 48 insertions(+), 38 deletions(-)

diff --git a/lib/HLSL/DxilGenerationPass.cpp b/lib/HLSL/DxilGenerationPass.cpp
index c3a6ad7dfc..3930cc3f2f 100644
--- a/lib/HLSL/DxilGenerationPass.cpp
+++ b/lib/HLSL/DxilGenerationPass.cpp
@@ -993,11 +993,10 @@ void ReplaceMinPrecisionRawBufferStoreByType(
         Args.emplace_back(NewV);
       }
     } else if (FromTy->isIntegerTy()) {
-      // This case only applies to typed buffer since Store operation of byte
-      // address buffer for min precision is handled by implicit conversion on
-      // intrinsic call. Since we are extending integer, we have to know if we
-      // should sign ext or zero ext. We can do this by iterating checking the
-      // size of the element at struct type and comp type at type annotation
+      // Since we are extending integer, we have to know if we should sign ext
+      // or zero ext. For StructuredBuffers we get signedness from the struct
+      // type annotation. ByteAddressBuffer (raw buffer) has no struct
+      // annotation, so we fall back to sext, which is wrong for min16uint.
       CallInst *handleCI = dyn_cast<CallInst>(
           CI->getArgOperand(DxilInst_RawBufferStore::arg_uav));
       DXASSERT(handleCI,
@@ -1007,34 +1006,50 @@ void ReplaceMinPrecisionRawBufferStoreByType(
                "otherwise fail to handle for buffer store lost its retTy");
       StructType *STy = dyn_cast<StructType>(resTyIt->second);
 
-      STy = cast<StructType>(STy->getElementType(0));
-      DxilStructAnnotation *SAnnot = typeSys.GetStructAnnotation(STy);
-      ConstantInt *offsetInt = dyn_cast<ConstantInt>(
-          CI->getArgOperand(DxilInst_RawBufferStore::arg_elementOffset));
-      unsigned offset = offsetInt->getSExtValue();
-      unsigned currentOffset = 0;
-      for (DxilStructTypeIterator iter = begin(STy, SAnnot),
-                                  ItEnd = end(STy, SAnnot);
-           iter != ItEnd; ++iter) {
-        std::pair<Type *, DxilFieldAnnotation *> pair = *iter;
-        currentOffset += DL.getTypeAllocSize(pair.first);
-        if (currentOffset > offset) {
-          if (pair.second->GetCompType().IsUIntTy()) {
-            for (unsigned i = 4; i < 8; ++i) {
-              Value *NewV = CIBuilder.CreateZExt(CI->getArgOperand(i), ToTy);
-              Args.emplace_back(NewV);
+      StructType *InnerSTy =
+          STy ? dyn_cast<StructType>(STy->getElementType(0)) : nullptr;
+      DxilStructAnnotation *SAnnot =
+          InnerSTy ? typeSys.GetStructAnnotation(InnerSTy) : nullptr;
+
+      if (SAnnot) {
+        // StructuredBuffer path: use struct annotation to determine signedness.
+        ConstantInt *offsetInt = dyn_cast<ConstantInt>(
+            CI->getArgOperand(DxilInst_RawBufferStore::arg_elementOffset));
+        unsigned offset = offsetInt->getSExtValue();
+        unsigned currentOffset = 0;
+        for (DxilStructTypeIterator iter = begin(InnerSTy, SAnnot),
+                                    ItEnd = end(InnerSTy, SAnnot);
+             iter != ItEnd; ++iter) {
+          std::pair<Type *, DxilFieldAnnotation *> pair = *iter;
+          currentOffset += DL.getTypeAllocSize(pair.first);
+          if (currentOffset > offset) {
+            if (pair.second->GetCompType().IsUIntTy()) {
+              for (unsigned i = 4; i < 8; ++i) {
+                Value *NewV = CIBuilder.CreateZExt(CI->getArgOperand(i), ToTy);
+                Args.emplace_back(NewV);
+              }
+              break;
+            } else if (pair.second->GetCompType().IsIntTy()) {
+              for (unsigned i = 4; i < 8; ++i) {
+                Value *NewV = CIBuilder.CreateSExt(CI->getArgOperand(i), ToTy);
+                Args.emplace_back(NewV);
+              }
+              break;
+            } else {
+              DXASSERT(false, "Invalid comp type");
             }
-            break;
-          } else if (pair.second->GetCompType().IsIntTy()) {
-            for (unsigned i = 4; i < 8; ++i) {
-              Value *NewV = CIBuilder.CreateSExt(CI->getArgOperand(i), ToTy);
-              Args.emplace_back(NewV);
-            }
-            break;
-          } else {
-            DXASSERT(false, "Invalid comp type");
           }
         }
+      } else {
+        // ByteAddressBuffer path: no struct annotation available, so
+        // signedness is unknown. Default to sext.
+        for (unsigned i = 4; i < 8; ++i) {
+          Value *Arg = CI->getArgOperand(i);
+          if (isa<UndefValue>(Arg))
+            Args.emplace_back(UndefValue::get(ToTy));
+          else
+            Args.emplace_back(CIBuilder.CreateSExt(Arg, ToTy));
+        }
       }
     }
 
diff --git a/lib/HLSL/HLOperationLower.cpp b/lib/HLSL/HLOperationLower.cpp
index 0e1b550626..2705fc99af 100644
--- a/lib/HLSL/HLOperationLower.cpp
+++ b/lib/HLSL/HLOperationLower.cpp
@@ -4603,14 +4603,9 @@ void TranslateStore(DxilResource::Kind RK, Value *handle, Value *val,
   }
 
   // Min precision alloc size is 32-bit; widen to match store intrinsic.
-  // StructuredBuffer scalar stores are handled by
-  // TranslateMinPrecisionRawBuffer in DxilGenerationPass, which has signedness
-  // info from struct annotations. ByteAddressBuffer (RawBuffer) scalar stores
-  // must be widened here because that later pass crashes on non-struct resource
-  // types (cast<StructType> fail).
-  if (opcode == OP::OpCode::RawBufferVectorStore ||
-      (opcode == OP::OpCode::RawBufferStore &&
-       RK == DxilResource::Kind::RawBuffer)) {
+  // Scalar RawBufferStore widening is handled by TranslateMinPrecisionRawBuffer
+  // in DxilGenerationPass, which has signedness info from struct annotations.
+  if (opcode == OP::OpCode::RawBufferVectorStore) {
     const DataLayout &DL =
         OP->GetModule()->GetHLModule().GetModule()->getDataLayout();
     Type *WideTy = widenMinPrecisionType(Ty, Builder.getContext(), DL);