Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions lib/Transforms/Scalar/GVN.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,20 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
StoredVal->getType()->isArrayTy())
return false;

// HLSL Change Begin - Don't coerce types that have padding in the data
// layout (e.g., min precision types where f16:32 means half is stored in 32
// bits). The coercion creates bitcasts between the LLVM type (based on
// primitive bit width) and an integer type (based on padded store size),
// which will fail when they differ.
Type *StoredValTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits();
uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits)
return false;
if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits)
return false;
// HLSL Change End

// The store has to be at least as big as the load.
if (DL.getTypeSizeInBits(StoredVal->getType()) <
DL.getTypeSizeInBits(LoadTy))
Expand Down Expand Up @@ -1942,6 +1956,21 @@ bool GVN::processLoad(LoadInst *L) {
if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
Value *StoredVal = DepSI->getValueOperand();

// HLSL Change Begin - Don't forward stores of padded types when the load
// type differs (e.g., min precision vectors where i16:32/f16:32 means
// elements are padded to 32 bits). BasicAA returns MustAlias at offset 0
// regardless of access sizes, so a partial element store can appear as a
// Def to MemoryDependence. CanCoerceMustAliasedValueToLoad catches this,
// but we add a defense-in-depth check here for cross-type forwarding of
// padded types. Same-type forwarding is always safe.
if (StoredVal->getType() != L->getType()) {
Type *StoredTy = StoredVal->getType();
uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits();
if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits)
return false;
}
// HLSL Change End

// The store and load are to a must-aliased pointer, but they may not
// actually have the same type. See if we know how to reuse the stored
// value (depending on its type).
Expand Down
16 changes: 12 additions & 4 deletions lib/Transforms/Scalar/SROA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1671,7 +1671,12 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
// extremely poorly defined currently. The long-term goal is to remove GEPing
// over a vector from the IR completely.
if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
// HLSL Change: Use alloc size instead of primitive type size for vector
// elements. DXC's data layout pads min precision types (i16:32, f16:32),
// so getTypeAllocSize matches the GEP offset stride while
// getTypeSizeInBits returns the unpadded primitive width.
unsigned ElementSizeInBits =
DL.getTypeAllocSizeInBits(VecTy->getScalarType());
if (ElementSizeInBits % 8 != 0) {
// GEPs over non-multiple of 8 size vector elements are invalid.
return nullptr;
Expand Down Expand Up @@ -2134,7 +2139,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P,

// Try each vector type, and return the one which works.
auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
// HLSL Change: Use alloc size to match GEP offset stride for padded types.
uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType());

// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
Expand Down Expand Up @@ -2492,12 +2498,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
: nullptr),
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
// HLSL Change: Use alloc size to match GEP offset stride for padded
// types.
ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0),
BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 &&
"Only multiple-of-8 sized vector elements are viable");
++NumVectorized;
}
Expand Down
153 changes: 153 additions & 0 deletions test/Transforms/GVN/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
; RUN: opt < %s -basicaa -gvn -S | FileCheck %s

; Regression test for min precision vector GVN miscompilation.
; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not:
; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad)
; 2. Forward a zeroinitializer store past partial element stores (processLoad)
;
; Without the fix, GVN would forward the zeroinitializer vector load, producing
; incorrect all-zero results for elements that were individually written.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>.
; The store of zeroinitializer to %dst is followed by an element store to
; %dst[0], then a vector load of %dst. GVN must not replace the vector load
; with the zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %val
; The vector load must survive — GVN must not replace it with zeroinitializer.
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
define <3 x i16> @test_no_forward_i16_vec3(i16 %val) {
entry:
; Under this datalayout (i16:32) each element occupies 4 bytes in memory.
%dst = alloca <3 x i16>, align 4
; Full-vector zero store, then a partial overwrite of element 0.
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %val, i16* %elem0, align 4
; GVN must not forward the zeroinitializer to this load: the element
; store above has clobbered part of the vector.
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_no_forward_f16_vec3
; CHECK: store <3 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <3 x half>
; CHECK: ret <3 x half> %result
define <3 x half> @test_no_forward_f16_vec3(half %val) {
entry:
; f16:32 padding: each half element occupies 4 bytes in memory.
%dst = alloca <3 x half>, align 4
; Zero the whole vector, then partially overwrite element 0.
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %val, half* %elem0, align 4
; The zeroinitializer must not be forwarded past the element store.
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Multiple element stores — all must survive.
; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer.

; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems
; CHECK: store <3 x i16> zeroinitializer
; CHECK: store i16 %v0
; CHECK: store i16 %v1
; CHECK: store i16 %v2
; CHECK: %result = load <3 x i16>
; CHECK: ret <3 x i16> %result
define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
; Zero the vector, then overwrite every element individually; each
; element store is a partial write relative to the padded vector.
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
; All three element stores must survive and the vector load must not be
; replaced by the zeroinitializer.
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type.
; GVN must not attempt bitcast coercion on padded types.

; CHECK-LABEL: @test_no_coerce_i16_vec3
; CHECK: store <3 x i16>
; The load of a different type from the same pointer must not be coerced.
; CHECK: load
; CHECK: ret
define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) {
entry:
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
; Reinterpret the slot as i96 (3 x 32 padded bits) and load through it.
; GVN must not bitcast-coerce the stored <3 x i16> (48 primitive bits)
; to satisfy this differently-typed load.
%iptr = bitcast <3 x i16>* %ptr to i96*
%result = load i96, i96* %iptr, align 4
ret i96 %result
}

; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_no_forward_i16_vec5
; CHECK: store <5 x i16> zeroinitializer
; CHECK: store i16 %val
; CHECK: %result = load <5 x i16>
; CHECK: ret <5 x i16> %result
define <5 x i16> @test_no_forward_i16_vec5(i16 %val) {
entry:
; Long-vector (>4 elements) variant of the partial-overwrite pattern.
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %val, i16* %elem0, align 4
; The vector load must survive; forwarding the zeroinitializer would
; drop the element write above.
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector variant — <8 x half>.

; CHECK-LABEL: @test_no_forward_f16_vec8
; CHECK: store <8 x half> zeroinitializer
; CHECK: store half %val
; CHECK: %result = load <8 x half>
; CHECK: ret <8 x half> %result
define <8 x half> @test_no_forward_f16_vec8(half %val) {
entry:
; Long-vector half variant: 8 elements, each padded to 32 bits (f16:32).
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
%elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %val, half* %elem0, align 4
; Must reload the vector rather than reuse the zeroinitializer.
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}

; Test 7: Same-type store-to-load forwarding must still work for padded types.
; GVN should forward %v directly — no intervening writes, same type.

; CHECK-LABEL: @test_same_type_forward_i16_vec3
; The load should be eliminated and %v returned directly.
; CHECK-NOT: load
; CHECK: ret <3 x i16> %v
define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) {
entry:
; Positive case: store and load have the identical type and there are no
; intervening writes, so GVN should still forward %v and delete the load.
%ptr = alloca <3 x i16>, align 4
store <3 x i16> %v, <3 x i16>* %ptr, align 4
%result = load <3 x i16>, <3 x i16>* %ptr, align 4
ret <3 x i16> %result
}

; Test 8: Same-type forwarding for <3 x half>.

; CHECK-LABEL: @test_same_type_forward_f16_vec3
; CHECK-NOT: load
; CHECK: ret <3 x half> %v
define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) {
entry:
; Positive case for half vectors: same-type forwarding must remain
; possible even though the element type is padded by the datalayout.
%ptr = alloca <3 x half>, align 4
store <3 x half> %v, <3 x half>* %ptr, align 4
%result = load <3 x half>, <3 x half>* %ptr, align 4
ret <3 x half> %result
}
154 changes: 154 additions & 0 deletions test/Transforms/SROA/min-precision-padding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
; RUN: opt < %s -sroa -S | FileCheck %s

; Regression test for SROA miscompilation of min precision vector element access.
; DXC's data layout pads i16 to 32 bits (i16:32) and f16 to 32 bits (f16:32).
; SROA must use getTypeAllocSizeInBits (not getTypeSizeInBits) for vector element
; stride so that GEP byte offsets map to correct element indices.
;
; Without the fix, SROA used getTypeSizeInBits(i16)=16 bits (2 bytes) for the
; element stride, but GEP offsets use getTypeAllocSize(i16)=4 bytes. So byte
; offset 4 (element 1) was mapped to index 4/2=2 instead of 4/4=1, causing
; element stores to be misplaced or eliminated.

target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
target triple = "dxil-ms-dx"

; Test 1: Element-wise write to <3 x i16> vector.
; SROA must map GEP byte offsets to correct element indices using alloc size
; (4 bytes per i16), not primitive size (2 bytes). All stores must survive
; with correct indices, and the final vector load must be preserved.

; CHECK-LABEL: @test_sroa_i16_vec3
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %v2
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) {
entry:
%dst = alloca <3 x i16>, align 4
; With i16:32, GEP element strides are 4 bytes; SROA must divide slice
; byte offsets by the 4-byte alloc size to recover indices 0, 1, 2.
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
%e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 2: Same pattern with <3 x half> (f16:32 padding).

; CHECK-LABEL: @test_sroa_f16_vec3
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1
; CHECK: store half %v1
; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2
; CHECK: store half %v2
; CHECK: load <3 x half>
; CHECK: ret <3 x half>
define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) {
entry:
%dst = alloca <3 x half>, align 4
; f16:32 variant: each half element strides 4 bytes, so offsets 0/4/8
; must map to element indices 0/1/2.
store <3 x half> zeroinitializer, <3 x half>* %dst, align 4
%e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1
store half %v1, half* %e1, align 4
%e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2
store half %v2, half* %e2, align 4
%result = load <3 x half>, <3 x half>* %dst, align 4
ret <3 x half> %result
}

; Test 3: Partial write — only element 1 is stored. SROA must index it correctly.

; CHECK-LABEL: @test_sroa_i16_vec3_elem1
; Element 1 store must be correctly placed at GEP index 1, not index 2.
; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong).
; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct).
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
; Single write to element 1 (byte offset 4). The broken stride math
; (offset 4 / primitive size 2) would misplace this at index 2.
%e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1
store i16 %val, i16* %e1, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 4: Element 2 store — verifies highest index is correct.

; CHECK-LABEL: @test_sroa_i16_vec3_elem2
; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2
; CHECK: store i16 %val
; CHECK: load <3 x i16>
; CHECK: ret <3 x i16>
define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) {
entry:
%dst = alloca <3 x i16>, align 4
store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4
; Write to the last element (byte offset 8) — checks the highest index
; is still computed correctly with the 4-byte element stride.
%e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2
store i16 %val, i16* %e2, align 4
%result = load <3 x i16>, <3 x i16>* %dst, align 4
ret <3 x i16> %result
}

; Test 5: Long vector — <5 x i16> (exceeds 4-element native size).

; CHECK-LABEL: @test_sroa_i16_vec5
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0
; CHECK: store i16 %v0
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1
; CHECK: store i16 %v1
; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4
; CHECK: store i16 %v4
; CHECK: load <5 x i16>
; CHECK: ret <5 x i16>
define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) {
entry:
; Long vector (5 elements): every element written individually; SROA
; must keep all five stores at their correct 4-byte-stride indices.
%dst = alloca <5 x i16>, align 4
store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4
%e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0
store i16 %v0, i16* %e0, align 4
%e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1
store i16 %v1, i16* %e1, align 4
%e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2
store i16 %v2, i16* %e2, align 4
%e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3
store i16 %v3, i16* %e3, align 4
%e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4
store i16 %v4, i16* %e4, align 4
%result = load <5 x i16>, <5 x i16>* %dst, align 4
ret <5 x i16> %result
}

; Test 6: Long vector — <8 x half>.

; CHECK-LABEL: @test_sroa_f16_vec8_partial
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0
; CHECK: store half %v0
; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7
; CHECK: store half %v7
; CHECK: load <8 x half>
; CHECK: ret <8 x half>
define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) {
entry:
%dst = alloca <8 x half>, align 4
store <8 x half> zeroinitializer, <8 x half>* %dst, align 4
; Only the first and last elements are written; index 7 sits at byte
; offset 28, which the 4-byte alloc-size stride maps back to index 7.
%e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0
store half %v0, half* %e0, align 4
%e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7
store half %v7, half* %e7, align 4
%result = load <8 x half>, <8 x half>* %dst, align 4
ret <8 x half> %result
}
Loading