From b34136b9a0466b6c6957d02de36fbb3537dc5f3b Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Fri, 13 Mar 2026 23:44:21 -0700 Subject: [PATCH 1/5] Fix GVN and SROA miscompilation of min precision vector element access Multiple optimization passes mishandle min precision vector types due to DXC's padded data layout (i16:32, f16:32), where getTypeSizeInBits returns padded sizes for vectors but primitive sizes for scalars. Bug 1 - GVN ICE: CanCoerceMustAliasedValueToLoad computes a padded integer type (e.g., i96 for <3 x half>) then attempts to bitcast from the 48-bit LLVM type, triggering an assert. Fix: reject coercion when type sizes include padding. Bug 2 - GVN incorrect store forwarding: processLoad forwards a 'store <3 x i16> zeroinitializer' past partial element stores because MemoryDependence uses padded sizes for aliasing. Fix: skip store-to-load forwarding for padded types. Bug 3 - SROA element misindexing: getNaturalGEPRecursively uses getTypeSizeInBits (2 bytes for i16) for element offsets while GEP uses getTypeAllocSize (4 bytes with i16:32). Byte offset 4 (element 1) maps to index 4/2=2 instead of 4/4=1, causing SROA to misplace or eliminate element stores. Fix: use getTypeAllocSizeInBits consistently for vector element sizes throughout SROA. 
Fixes #8268 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/Transforms/Scalar/GVN.cpp | 25 +++++++++++++++++++++++++ lib/Transforms/Scalar/SROA.cpp | 14 ++++++++++---- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index e3a8998372..5113563398 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -853,6 +853,20 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, StoredVal->getType()->isArrayTy()) return false; + // HLSL Change Begin - Don't coerce types that have padding in the data + // layout (e.g., min precision types where f16:32 means half is stored in 32 + // bits). The coercion creates bitcasts between the LLVM type (based on + // primitive bit width) and an integer type (based on padded store size), + // which will fail when they differ. + Type *StoredValTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits(); + uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits) + return false; + if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits) + return false; + // HLSL Change End + // The store has to be at least as big as the load. if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy)) @@ -1942,6 +1956,17 @@ bool GVN::processLoad(LoadInst *L) { if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); + // HLSL Change Begin - Don't forward stores of types with data layout + // padding (e.g., min precision vectors where i16:32/f16:32 means elements + // are padded to 32 bits). MemoryDependence may incorrectly classify + // intermediate partial stores as non-clobbering when sizes include padding, + // leading to incorrect store-to-load forwarding. 
+ Type *StoredTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) + return false; + // HLSL Change End + // The store and load are to a must-aliased pointer, but they may not // actually have the same type. See if we know how to reuse the stored // value (depending on its type). diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 0c598b9563..eb07eb6fde 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1671,7 +1671,11 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); + // HLSL Change: Use alloc size instead of primitive type size for vector + // elements. DXC's data layout pads min precision types (i16:32, f16:32), + // so getTypeAllocSize matches the GEP offset stride while + // getTypeSizeInBits returns the unpadded primitive width. + unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; } @@ -2134,7 +2138,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, // Try each vector type, and return the one which works. auto CheckVectorTypeForPromotion = [&](VectorType *VTy) { - uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()); + // HLSL Change: Use alloc size to match GEP offset stride for padded types. + uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized. 
@@ -2492,12 +2497,13 @@ class AllocaSliceRewriter : public InstVisitor { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), - ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + // HLSL Change: Use alloc size to match GEP offset stride for padded types. + ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { - assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && + assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ++NumVectorized; } From 70294faac4c9234eec1df9c3f0d3b3a5c63d7db7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 14 Mar 2026 06:53:16 +0000 Subject: [PATCH 2/5] chore: autopublish 2026-03-14T06:53:16Z --- lib/Transforms/Scalar/SROA.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index eb07eb6fde..7f261fcfe3 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1675,7 +1675,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // elements. DXC's data layout pads min precision types (i16:32, f16:32), // so getTypeAllocSize matches the GEP offset stride while // getTypeSizeInBits returns the unpadded primitive width. - unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = + DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; @@ -2497,7 +2498,8 @@ class AllocaSliceRewriter : public InstVisitor { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? 
VecTy->getElementType() : nullptr), - // HLSL Change: Use alloc size to match GEP offset stride for padded types. + // HLSL Change: Use alloc size to match GEP offset stride for padded + // types. ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), From e48fba78bacdfd46b106c8a53c4335028cbfa378 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Mon, 16 Mar 2026 13:42:02 -0700 Subject: [PATCH 3/5] Apply clang-format to SROA.cpp HLSL Change comments Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/Transforms/Scalar/SROA.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index eb07eb6fde..7f261fcfe3 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1675,7 +1675,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // elements. DXC's data layout pads min precision types (i16:32, f16:32), // so getTypeAllocSize matches the GEP offset stride while // getTypeSizeInBits returns the unpadded primitive width. - unsigned ElementSizeInBits = DL.getTypeAllocSizeInBits(VecTy->getScalarType()); + unsigned ElementSizeInBits = + DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; @@ -2497,7 +2498,8 @@ class AllocaSliceRewriter : public InstVisitor { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), - // HLSL Change: Use alloc size to match GEP offset stride for padded types. + // HLSL Change: Use alloc size to match GEP offset stride for padded + // types. ElementSize(VecTy ? 
DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), From b74226c1529df5e569c6d3747dfd19e9e95c7071 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Mon, 16 Mar 2026 15:43:14 -0700 Subject: [PATCH 4/5] Add regression tests for min precision GVN and SROA fixes Add lit tests that verify GVN and SROA handle DXC's padded min precision types (i16:32, f16:32) correctly. GVN tests (test/Transforms/GVN/min-precision-padding.ll): - Verify store-to-load forwarding is blocked for padded vector types - Verify type coercion is rejected for padded types - Cover <3 x i16>, <3 x half>, <5 x i16>, <8 x half> SROA tests (test/Transforms/SROA/min-precision-padding.ll): - Verify GEP element indices use alloc size (4 bytes) not prim size (2 bytes) - Verify all element stores survive with correct indices - Cover <3 x i16>, <3 x half>, <5 x i16>, <8 x half> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test/Transforms/GVN/min-precision-padding.ll | 125 ++++++++++++++ test/Transforms/SROA/min-precision-padding.ll | 154 ++++++++++++++++++ 2 files changed, 279 insertions(+) create mode 100644 test/Transforms/GVN/min-precision-padding.ll create mode 100644 test/Transforms/SROA/min-precision-padding.ll diff --git a/test/Transforms/GVN/min-precision-padding.ll b/test/Transforms/GVN/min-precision-padding.ll new file mode 100644 index 0000000000..bcbc45ae98 --- /dev/null +++ b/test/Transforms/GVN/min-precision-padding.ll @@ -0,0 +1,125 @@ +; RUN: opt < %s -basicaa -gvn -S | FileCheck %s + +; Regression test for min precision vector GVN miscompilation. +; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not: +; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad) +; 2. 
Forward a zeroinitializer store past partial element stores (processLoad) +; +; Without the fix, GVN would forward the zeroinitializer vector load, producing +; incorrect all-zero results for elements that were individually written. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>. +; The store of zeroinitializer to %dst is followed by an element store to +; %dst[0], then a vector load of %dst. GVN must not replace the vector load +; with the zeroinitializer. + +; CHECK-LABEL: @test_no_forward_i16_vec3 +; CHECK: store <3 x i16> zeroinitializer +; CHECK: store i16 %val +; The vector load must survive — GVN must not replace it with zeroinitializer. +; CHECK: %result = load <3 x i16> +; CHECK: ret <3 x i16> %result +define <3 x i16> @test_no_forward_i16_vec3(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %val, i16* %elem0, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 2: Same pattern with <3 x half> (f16:32 padding). + +; CHECK-LABEL: @test_no_forward_f16_vec3 +; CHECK: store <3 x half> zeroinitializer +; CHECK: store half %val +; CHECK: %result = load <3 x half> +; CHECK: ret <3 x half> %result +define <3 x half> @test_no_forward_f16_vec3(half %val) { +entry: + %dst = alloca <3 x half>, align 4 + store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 + %elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 + store half %val, half* %elem0, align 4 + %result = load <3 x half>, <3 x half>* %dst, align 4 + ret <3 x half> %result +} + +; Test 3: Multiple element stores — all must survive. +; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer. 
+ +; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems +; CHECK: store <3 x i16> zeroinitializer +; CHECK: store i16 %v0 +; CHECK: store i16 %v1 +; CHECK: store i16 %v2 +; CHECK: %result = load <3 x i16> +; CHECK: ret <3 x i16> %result +define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type. +; GVN must not attempt bitcast coercion on padded types. + +; CHECK-LABEL: @test_no_coerce_i16_vec3 +; CHECK: store <3 x i16> +; The load of a different type from the same pointer must not be coerced. +; CHECK: load +; CHECK: ret +define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) { +entry: + %ptr = alloca <3 x i16>, align 4 + store <3 x i16> %v, <3 x i16>* %ptr, align 4 + %iptr = bitcast <3 x i16>* %ptr to i96* + %result = load i96, i96* %iptr, align 4 + ret i96 %result +} + +; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size). 
+ +; CHECK-LABEL: @test_no_forward_i16_vec5 +; CHECK: store <5 x i16> zeroinitializer +; CHECK: store i16 %val +; CHECK: %result = load <5 x i16> +; CHECK: ret <5 x i16> %result +define <5 x i16> @test_no_forward_i16_vec5(i16 %val) { +entry: + %dst = alloca <5 x i16>, align 4 + store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 + %elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 + store i16 %val, i16* %elem0, align 4 + %result = load <5 x i16>, <5 x i16>* %dst, align 4 + ret <5 x i16> %result +} + +; Test 6: Long vector variant — <8 x half>. + +; CHECK-LABEL: @test_no_forward_f16_vec8 +; CHECK: store <8 x half> zeroinitializer +; CHECK: store half %val +; CHECK: %result = load <8 x half> +; CHECK: ret <8 x half> %result +define <8 x half> @test_no_forward_f16_vec8(half %val) { +entry: + %dst = alloca <8 x half>, align 4 + store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 + %elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 + store half %val, half* %elem0, align 4 + %result = load <8 x half>, <8 x half>* %dst, align 4 + ret <8 x half> %result +} diff --git a/test/Transforms/SROA/min-precision-padding.ll b/test/Transforms/SROA/min-precision-padding.ll new file mode 100644 index 0000000000..b431b5ea07 --- /dev/null +++ b/test/Transforms/SROA/min-precision-padding.ll @@ -0,0 +1,154 @@ +; RUN: opt < %s -sroa -S | FileCheck %s + +; Regression test for SROA miscompilation of min precision vector element access. +; DXC's data layout pads i16 to 32 bits (i16:32) and f16 to 32 bits (f16:32). +; SROA must use getTypeAllocSizeInBits (not getTypeSizeInBits) for vector element +; stride so that GEP byte offsets map to correct element indices. +; +; Without the fix, SROA used getTypeSizeInBits(i16)=16 bits (2 bytes) for the +; element stride, but GEP offsets use getTypeAllocSize(i16)=4 bytes. 
So byte +; offset 4 (element 1) was mapped to index 4/2=2 instead of 4/4=1, causing +; element stores to be misplaced or eliminated. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Test 1: Element-wise write to <3 x i16> vector. +; SROA must map GEP byte offsets to correct element indices using alloc size +; (4 bytes per i16), not primitive size (2 bytes). All stores must survive +; with correct indices, and the final vector load must be preserved. + +; CHECK-LABEL: @test_sroa_i16_vec3 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0 +; CHECK: store i16 %v0 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %v1 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 +; CHECK: store i16 %v2 +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 2: Same pattern with <3 x half> (f16:32 padding). 
+ +; CHECK-LABEL: @test_sroa_f16_vec3 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0 +; CHECK: store half %v0 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1 +; CHECK: store half %v1 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2 +; CHECK: store half %v2 +; CHECK: load <3 x half> +; CHECK: ret <3 x half> +define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) { +entry: + %dst = alloca <3 x half>, align 4 + store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 + %e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 + store half %v0, half* %e0, align 4 + %e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1 + store half %v1, half* %e1, align 4 + %e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2 + store half %v2, half* %e2, align 4 + %result = load <3 x half>, <3 x half>* %dst, align 4 + ret <3 x half> %result +} + +; Test 3: Partial write — only element 1 is stored. SROA must index it correctly. + +; CHECK-LABEL: @test_sroa_i16_vec3_elem1 +; Element 1 store must be correctly placed at GEP index 1, not index 2. +; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong). +; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct). +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %val +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %val, i16* %e1, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 4: Element 2 store — verifies highest index is correct. 
+ +; CHECK-LABEL: @test_sroa_i16_vec3_elem2 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 +; CHECK: store i16 %val +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %val, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 5: Long vector — <5 x i16> (exceeds 4-element native size). + +; CHECK-LABEL: @test_sroa_i16_vec5 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0 +; CHECK: store i16 %v0 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %v1 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4 +; CHECK: store i16 %v4 +; CHECK: load <5 x i16> +; CHECK: ret <5 x i16> +define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) { +entry: + %dst = alloca <5 x i16>, align 4 + store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3 + store i16 %v3, i16* %e3, align 4 + %e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4 + store i16 %v4, i16* %e4, align 4 + %result = load <5 x i16>, <5 x i16>* %dst, align 4 + ret <5 x i16> %result +} + +; Test 6: Long vector — <8 x half>. 
+ +; CHECK-LABEL: @test_sroa_f16_vec8_partial +; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0 +; CHECK: store half %v0 +; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7 +; CHECK: store half %v7 +; CHECK: load <8 x half> +; CHECK: ret <8 x half> +define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) { +entry: + %dst = alloca <8 x half>, align 4 + store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 + %e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 + store half %v0, half* %e0, align 4 + %e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7 + store half %v7, half* %e7, align 4 + %result = load <8 x half>, <8 x half>* %dst, align 4 + ret <8 x half> %result +} From 75c8dd7026ca57daf45a6c27f1e57463ce2feb67 Mon Sep 17 00:00:00 2001 From: Alex Sepkowski Date: Mon, 16 Mar 2026 16:49:16 -0700 Subject: [PATCH 5/5] Narrow GVN processLoad guard to allow same-type forwarding The processLoad padding guard was unconditionally blocking store-to-load forwarding for all padded vector types, including trivially correct same-type forwarding. Narrow the check to only fire when stored and loaded types differ, preserving defense-in-depth for cross-type forwarding while allowing valid same-type optimizations for i16/half vectors. Also fix the comment to accurately describe the root cause: BasicAA returns MustAlias at offset 0 regardless of access sizes, causing partial element stores to appear as Defs to MemoryDependence. Add test cases verifying same-type forwarding still works for padded types. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- lib/Transforms/Scalar/GVN.cpp | 22 ++++++++------- test/Transforms/GVN/min-precision-padding.ll | 28 ++++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 5113563398..5bb7839772 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -1956,15 +1956,19 @@ bool GVN::processLoad(LoadInst *L) { if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); - // HLSL Change Begin - Don't forward stores of types with data layout - // padding (e.g., min precision vectors where i16:32/f16:32 means elements - // are padded to 32 bits). MemoryDependence may incorrectly classify - // intermediate partial stores as non-clobbering when sizes include padding, - // leading to incorrect store-to-load forwarding. - Type *StoredTy = StoredVal->getType(); - uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); - if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) - return false; + // HLSL Change Begin - Don't forward stores of padded types when the load + // type differs (e.g., min precision vectors where i16:32/f16:32 means + // elements are padded to 32 bits). BasicAA returns MustAlias at offset 0 + // regardless of access sizes, so a partial element store can appear as a + // Def to MemoryDependence. CanCoerceMustAliasedValueToLoad catches this, + // but we add a defense-in-depth check here for cross-type forwarding of + // padded types. Same-type forwarding is always safe. 
+ if (StoredVal->getType() != L->getType()) { + Type *StoredTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) + return false; + } // HLSL Change End // The store and load are to a must-aliased pointer, but they may not diff --git a/test/Transforms/GVN/min-precision-padding.ll b/test/Transforms/GVN/min-precision-padding.ll index bcbc45ae98..cdb51d9ce1 100644 --- a/test/Transforms/GVN/min-precision-padding.ll +++ b/test/Transforms/GVN/min-precision-padding.ll @@ -123,3 +123,31 @@ entry: %result = load <8 x half>, <8 x half>* %dst, align 4 ret <8 x half> %result } + +; Test 7: Same-type store-to-load forwarding must still work for padded types. +; GVN should forward %v directly — no intervening writes, same type. + +; CHECK-LABEL: @test_same_type_forward_i16_vec3 +; The load should be eliminated and %v returned directly. +; CHECK-NOT: load +; CHECK: ret <3 x i16> %v +define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) { +entry: + %ptr = alloca <3 x i16>, align 4 + store <3 x i16> %v, <3 x i16>* %ptr, align 4 + %result = load <3 x i16>, <3 x i16>* %ptr, align 4 + ret <3 x i16> %result +} + +; Test 8: Same-type forwarding for <3 x half>. + +; CHECK-LABEL: @test_same_type_forward_f16_vec3 +; CHECK-NOT: load +; CHECK: ret <3 x half> %v +define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) { +entry: + %ptr = alloca <3 x half>, align 4 + store <3 x half> %v, <3 x half>* %ptr, align 4 + %result = load <3 x half>, <3 x half>* %ptr, align 4 + ret <3 x half> %result +}