From b34136b9a0466b6c6957d02de36fbb3537dc5f3b Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Fri, 13 Mar 2026 23:44:21 -0700 Subject: [PATCH 1/5] Fix GVN and SROA miscompilation of min precision vector element access Multiple optimization passes mishandle min precision vector types due to DXC's padded data layout (i16:32, f16:32), where getTypeSizeInBits returns padded sizes for vectors but primitive sizes for scalars. Bug 1 - GVN ICE: CanCoerceMustAliasedValueToLoad computes a padded integer type (e.g., i96 for <3 x half>) then attempts to bitcast from the 48-bit LLVM type, triggering an assert. Fix: reject coercion when type sizes include padding. Bug 2 - GVN incorrect store forwarding: processLoad forwards a 'store <3 x i16> zeroinitializer' past partial element stores because MemoryDependence uses padded sizes for aliasing. Fix: skip store-to-load forwarding for padded types. Bug 3 - SROA element misindexing: getNaturalGEPRecursively uses getTypeSizeInBits (2 bytes for i16) for element offsets while GEP uses getTypeAllocSize (4 bytes with i16:32). Byte offset 4 (element 1) maps to index 4/2=2 instead of 4/4=1, causing SROA to misplace or eliminate element stores. Fix: use getTypeAllocSizeInBits consistently for vector element sizes throughout SROA. 
Fixes #8268 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/Transforms/Scalar/GVN.cpp | 25 +++++++++++++++++++++++++ lib/Transforms/Scalar/SROA.cpp | 14 ++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index e3a8998372..5113563398 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -853,6 +853,20 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, StoredVal->getType()->isArrayTy()) return false; + // HLSL Change Begin - Don't coerce types that have padding in the data + // layout (e.g., min precision types where f16:32 means half is stored in 32 + // bits). The coercion creates bitcasts between the LLVM type (based on + // primitive bit width) and an integer type (based on padded store size), + // which will fail when they differ. + Type *StoredValTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits(); + uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits) + return false; + if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits) + return false; + // HLSL Change End + // The store has to be at least as big as the load. if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy)) @@ -1942,6 +1956,17 @@ bool GVN::processLoad(LoadInst *L) { if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); + // HLSL Change Begin - Don't forward stores of types with data layout + // padding (e.g., min precision vectors where i16:32/f16:32 means elements + // are padded to 32 bits). MemoryDependence may incorrectly classify + // intermediate partial stores as non-clobbering when sizes include padding, + // leading to incorrect store-to-load forwarding. 
+ Type *StoredTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) + return false; + // HLSL Change End + // The store and load are to a must-aliased pointer, but they may not // actually have the same type. See if we know how to reuse the stored // value (depending on its type). diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 0c598b9563..eb07eb6fde 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1671,7 +1671,11 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); + // HLSL Change: Use alloc size instead of primitive type size for vector + // elements. DXC's data layout pads min precision types (i16:32, f16:32), + // so getTypeAllocSize matches the GEP offset stride while + // getTypeSizeInBits returns the unpadded primitive width. + unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; } @@ -2134,7 +2138,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, // Try each vector type, and return the one which works. auto CheckVectorTypeForPromotion = [&](VectorType *VTy) { - uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()); + // HLSL Change: Use alloc size to match GEP offset stride for padded types. + uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. 
@@ -2492,12 +2497,13 @@ class AllocaSliceRewriter : public InstVisitor { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), - ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + // HLSL Change: Use alloc size to match GEP offset stride for padded types. + ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { - assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && + assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ++NumVectorized; } From 70294faac4c9234eec1df9c3f0d3b3a5c63d7db7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 06:53:16 +0000 Subject: [PATCH 2/5] chore: autopublish 2026-03-14T06:53:16Z --- lib/Transforms/Scalar/SROA.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index eb07eb6fde..7f261fcfe3 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1675,7 +1675,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // elements. DXC's data layout pads min precision types (i16:32, f16:32), // so getTypeAllocSize matches the GEP offset stride while // getTypeSizeInBits returns the unpadded primitive width. - unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = + DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; @@ -2497,7 +2498,8 @@ class AllocaSliceRewriter : public InstVisitor { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? 
VecTy->getElementType() : nullptr), - // HLSL Change: Use alloc size to match GEP offset stride for padded types. + // HLSL Change: Use alloc size to match GEP offset stride for padded + // types. ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), From e48fba78bacdfd46b106c8a53c4335028cbfa378 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Mon, 16 Mar 2026 13:42:02 -0700 Subject: [PATCH 3/5] Apply clang-format to SROA.cpp HLSL Change comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/Transforms/Scalar/SROA.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index eb07eb6fde..7f261fcfe3 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1675,7 +1675,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // elements. DXC's data layout pads min precision types (i16:32, f16:32), // so getTypeAllocSize matches the GEP offset stride while // getTypeSizeInBits returns the unpadded primitive width. - unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = + DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; @@ -2497,7 +2498,8 @@ class AllocaSliceRewriter : public InstVisitor { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), - // HLSL Change: Use alloc size to match GEP offset stride for padded types. + // HLSL Change: Use alloc size to match GEP offset stride for padded + // types. ElementSize(VecTy ? 
DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), From b74226c1529df5e569c6d3747dfd19e9e95c7071 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Mon, 16 Mar 2026 15:43:14 -0700 Subject: [PATCH 4/5] Add regression tests for min precision GVN and SROA fixes Add lit tests that verify GVN and SROA handle DXC's padded min precision types (i16:32, f16:32) correctly. GVN tests (test/Transforms/GVN/min-precision-padding.ll): - Verify store-to-load forwarding is blocked for padded vector types - Verify type coercion is rejected for padded types - Cover <3 x i16>, <3 x half>, <5 x i16>, <8 x half> SROA tests (test/Transforms/SROA/min-precision-padding.ll): - Verify GEP element indices use alloc size (4 bytes) not prim size (2 bytes) - Verify all element stores survive with correct indices - Cover <3 x i16>, <3 x half>, <5 x i16>, <8 x half> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/Transforms/GVN/min-precision-padding.ll | 125 ++++++++++++++ test/Transforms/SROA/min-precision-padding.ll | 154 ++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100644 test/Transforms/GVN/min-precision-padding.ll create mode 100644 test/Transforms/SROA/min-precision-padding.ll diff --git a/test/Transforms/GVN/min-precision-padding.ll b/test/Transforms/GVN/min-precision-padding.ll new file mode 100644 index 0000000000..bcbc45ae98 --- /dev/null +++ b/test/Transforms/GVN/min-precision-padding.ll @@ -0,0 +1,125 @@ +; RUN: opt < %s -basicaa -gvn -S | FileCheck %s + +; Regression test for min precision vector GVN miscompilation. +; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not: +; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad) +; 2. 
Forward a zeroinitializer store past partial element stores (processLoad) +; +; Without the fix, GVN would forward the zeroinitializer vector load, producing +; incorrect all-zero results for elements that were individually written. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>. +; The store of zeroinitializer to %dst is followed by an element store to +; %dst[0], then a vector load of %dst. GVN must not replace the vector load +; with the zeroinitializer. + +; CHECK-LABEL: @test_no_forward_i16_vec3 +; CHECK: store <3 x i16> zeroinitializer +; CHECK: store i16 %val +; The vector load must survive — GVN must not replace it with zeroinitializer. +; CHECK: %result = load <3 x i16> +; CHECK: ret <3 x i16> %result +define <3 x i16> @test_no_forward_i16_vec3(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %val, i16* %elem0, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 2: Same pattern with <3 x half> (f16:32 padding). + +; CHECK-LABEL: @test_no_forward_f16_vec3 +; CHECK: store <3 x half> zeroinitializer +; CHECK: store half %val +; CHECK: %result = load <3 x half> +; CHECK: ret <3 x half> %result +define <3 x half> @test_no_forward_f16_vec3(half %val) { +entry: + %dst = alloca <3 x half>, align 4 + store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 + %elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 + store half %val, half* %elem0, align 4 + %result = load <3 x half>, <3 x half>* %dst, align 4 + ret <3 x half> %result +} + +; Test 3: Multiple element stores — all must survive. +; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer. 
+ +; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems +; CHECK: store <3 x i16> zeroinitializer +; CHECK: store i16 %v0 +; CHECK: store i16 %v1 +; CHECK: store i16 %v2 +; CHECK: %result = load <3 x i16> +; CHECK: ret <3 x i16> %result +define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type. +; GVN must not attempt bitcast coercion on padded types. + +; CHECK-LABEL: @test_no_coerce_i16_vec3 +; CHECK: store <3 x i16> +; The load of a different type from the same pointer must not be coerced. +; CHECK: load +; CHECK: ret +define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) { +entry: + %ptr = alloca <3 x i16>, align 4 + store <3 x i16> %v, <3 x i16>* %ptr, align 4 + %iptr = bitcast <3 x i16>* %ptr to i96* + %result = load i96, i96* %iptr, align 4 + ret i96 %result +} + +; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size). 
+ +; CHECK-LABEL: @test_no_forward_i16_vec5 +; CHECK: store <5 x i16> zeroinitializer +; CHECK: store i16 %val +; CHECK: %result = load <5 x i16> +; CHECK: ret <5 x i16> %result +define <5 x i16> @test_no_forward_i16_vec5(i16 %val) { +entry: + %dst = alloca <5 x i16>, align 4 + store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 + %elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 + store i16 %val, i16* %elem0, align 4 + %result = load <5 x i16>, <5 x i16>* %dst, align 4 + ret <5 x i16> %result +} + +; Test 6: Long vector variant — <8 x half>. + +; CHECK-LABEL: @test_no_forward_f16_vec8 +; CHECK: store <8 x half> zeroinitializer +; CHECK: store half %val +; CHECK: %result = load <8 x half> +; CHECK: ret <8 x half> %result +define <8 x half> @test_no_forward_f16_vec8(half %val) { +entry: + %dst = alloca <8 x half>, align 4 + store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 + %elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 + store half %val, half* %elem0, align 4 + %result = load <8 x half>, <8 x half>* %dst, align 4 + ret <8 x half> %result +} diff --git a/test/Transforms/SROA/min-precision-padding.ll b/test/Transforms/SROA/min-precision-padding.ll new file mode 100644 index 0000000000..b431b5ea07 --- /dev/null +++ b/test/Transforms/SROA/min-precision-padding.ll @@ -0,0 +1,154 @@ +; RUN: opt < %s -sroa -S | FileCheck %s + +; Regression test for SROA miscompilation of min precision vector element access. +; DXC's data layout pads i16 to 32 bits (i16:32) and f16 to 32 bits (f16:32). +; SROA must use getTypeAllocSizeInBits (not getTypeSizeInBits) for vector element +; stride so that GEP byte offsets map to correct element indices. +; +; Without the fix, SROA used getTypeSizeInBits(i16)=16 bits (2 bytes) for the +; element stride, but GEP offsets use getTypeAllocSize(i16)=4 bytes. 
So byte +; offset 4 (element 1) was mapped to index 4/2=2 instead of 4/4=1, causing +; element stores to be misplaced or eliminated. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Test 1: Element-wise write to <3 x i16> vector. +; SROA must map GEP byte offsets to correct element indices using alloc size +; (4 bytes per i16), not primitive size (2 bytes). All stores must survive +; with correct indices, and the final vector load must be preserved. + +; CHECK-LABEL: @test_sroa_i16_vec3 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0 +; CHECK: store i16 %v0 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %v1 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 +; CHECK: store i16 %v2 +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 2: Same pattern with <3 x half> (f16:32 padding). 
+ +; CHECK-LABEL: @test_sroa_f16_vec3 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0 +; CHECK: store half %v0 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1 +; CHECK: store half %v1 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2 +; CHECK: store half %v2 +; CHECK: load <3 x half> +; CHECK: ret <3 x half> +define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) { +entry: + %dst = alloca <3 x half>, align 4 + store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 + %e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 + store half %v0, half* %e0, align 4 + %e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1 + store half %v1, half* %e1, align 4 + %e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2 + store half %v2, half* %e2, align 4 + %result = load <3 x half>, <3 x half>* %dst, align 4 + ret <3 x half> %result +} + +; Test 3: Partial write — only element 1 is stored. SROA must index it correctly. + +; CHECK-LABEL: @test_sroa_i16_vec3_elem1 +; Element 1 store must be correctly placed at GEP index 1, not index 2. +; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong). +; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct). +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %val +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %val, i16* %e1, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 4: Element 2 store — verifies highest index is correct. 
+ +; CHECK-LABEL: @test_sroa_i16_vec3_elem2 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 +; CHECK: store i16 %val +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %val, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 5: Long vector — <5 x i16> (exceeds 4-element native size). + +; CHECK-LABEL: @test_sroa_i16_vec5 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0 +; CHECK: store i16 %v0 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %v1 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4 +; CHECK: store i16 %v4 +; CHECK: load <5 x i16> +; CHECK: ret <5 x i16> +define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) { +entry: + %dst = alloca <5 x i16>, align 4 + store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3 + store i16 %v3, i16* %e3, align 4 + %e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4 + store i16 %v4, i16* %e4, align 4 + %result = load <5 x i16>, <5 x i16>* %dst, align 4 + ret <5 x i16> %result +} + +; Test 6: Long vector — <8 x half>. 
+ +; CHECK-LABEL: @test_sroa_f16_vec8_partial +; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0 +; CHECK: store half %v0 +; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7 +; CHECK: store half %v7 +; CHECK: load <8 x half> +; CHECK: ret <8 x half> +define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) { +entry: + %dst = alloca <8 x half>, align 4 + store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 + %e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 + store half %v0, half* %e0, align 4 + %e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7 + store half %v7, half* %e7, align 4 + %result = load <8 x half>, <8 x half>* %dst, align 4 + ret <8 x half> %result +} From 75c8dd7026ca57daf45a6c27f1e57463ce2feb67 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Mon, 16 Mar 2026 16:49:16 -0700 Subject: [PATCH 5/5] Narrow GVN processLoad guard to allow same-type forwarding The processLoad padding guard was unconditionally blocking store-to-load forwarding for all padded vector types, including trivially correct same-type forwarding. Narrow the check to only fire when stored and loaded types differ, preserving defense-in-depth for cross-type forwarding while allowing valid same-type optimizations for i16/half vectors. Also fix the comment to accurately describe the root cause: BasicAA returns MustAlias at offset 0 regardless of access sizes, causing partial element stores to appear as Defs to MemoryDependence. Add test cases verifying same-type forwarding still works for padded types. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/Transforms/Scalar/GVN.cpp | 22 ++++++++------- test/Transforms/GVN/min-precision-padding.ll | 28 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 5113563398..5bb7839772 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1956,15 +1956,19 @@ bool GVN::processLoad(LoadInst *L) { if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); - // HLSL Change Begin - Don't forward stores of types with data layout - // padding (e.g., min precision vectors where i16:32/f16:32 means elements - // are padded to 32 bits). MemoryDependence may incorrectly classify - // intermediate partial stores as non-clobbering when sizes include padding, - // leading to incorrect store-to-load forwarding. - Type *StoredTy = StoredVal->getType(); - uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); - if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) - return false; + // HLSL Change Begin - Don't forward stores of padded types when the load + // type differs (e.g., min precision vectors where i16:32/f16:32 means + // elements are padded to 32 bits). BasicAA returns MustAlias at offset 0 + // regardless of access sizes, so a partial element store can appear as a + // Def to MemoryDependence. CanCoerceMustAliasedValueToLoad catches this, + // but we add a defense-in-depth check here for cross-type forwarding of + // padded types. Same-type forwarding is always safe. 
+ if (StoredVal->getType() != L->getType()) { + Type *StoredTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) + return false; + } // HLSL Change End // The store and load are to a must-aliased pointer, but they may not diff --git a/test/Transforms/GVN/min-precision-padding.ll b/test/Transforms/GVN/min-precision-padding.ll index bcbc45ae98..cdb51d9ce1 100644 --- a/test/Transforms/GVN/min-precision-padding.ll +++ b/test/Transforms/GVN/min-precision-padding.ll @@ -123,3 +123,31 @@ entry: %result = load <8 x half>, <8 x half>* %dst, align 4 ret <8 x half> %result } + +; Test 7: Same-type store-to-load forwarding must still work for padded types. +; GVN should forward %v directly — no intervening writes, same type. + +; CHECK-LABEL: @test_same_type_forward_i16_vec3 +; The load should be eliminated and %v returned directly. +; CHECK-NOT: load +; CHECK: ret <3 x i16> %v +define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) { +entry: + %ptr = alloca <3 x i16>, align 4 + store <3 x i16> %v, <3 x i16>* %ptr, align 4 + %result = load <3 x i16>, <3 x i16>* %ptr, align 4 + ret <3 x i16> %result +} + +; Test 8: Same-type forwarding for <3 x half>. + +; CHECK-LABEL: @test_same_type_forward_f16_vec3 +; CHECK-NOT: load +; CHECK: ret <3 x half> %v +define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) { +entry: + %ptr = alloca <3 x half>, align 4 + store <3 x half> %v, <3 x half>* %ptr, align 4 + %result = load <3 x half>, <3 x half>* %ptr, align 4 + ret <3 x half> %result +}