diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index e3a8998372..5bb7839772 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -853,6 +853,20 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, StoredVal->getType()->isArrayTy()) return false; + // HLSL Change Begin - Don't coerce types that have padding in the data + // layout (e.g., min precision types where f16:32 means half is stored in 32 + // bits). The coercion creates bitcasts between the LLVM type (based on + // primitive bit width) and an integer type (based on padded store size), + // which will fail when they differ. + Type *StoredValTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredValTy->getPrimitiveSizeInBits(); + uint64_t LoadPrimBits = LoadTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredValTy) != StoredPrimBits) + return false; + if (LoadPrimBits && DL.getTypeSizeInBits(LoadTy) != LoadPrimBits) + return false; + // HLSL Change End + // The store has to be at least as big as the load. if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy)) @@ -1942,6 +1956,21 @@ bool GVN::processLoad(LoadInst *L) { if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); + // HLSL Change Begin - Don't forward stores of padded types when the load + // type differs (e.g., min precision vectors where i16:32/f16:32 means + // elements are padded to 32 bits). BasicAA returns MustAlias at offset 0 + // regardless of access sizes, so a partial element store can appear as a + // Def to MemoryDependence. CanCoerceMustAliasedValueToLoad catches this, + // but we add a defense-in-depth check here for cross-type forwarding of + // padded types. Same-type forwarding is always safe.
+ if (StoredVal->getType() != L->getType()) { + Type *StoredTy = StoredVal->getType(); + uint64_t StoredPrimBits = StoredTy->getPrimitiveSizeInBits(); + if (StoredPrimBits && DL.getTypeSizeInBits(StoredTy) != StoredPrimBits) + return false; + } + // HLSL Change End + // The store and load are to a must-aliased pointer, but they may not // actually have the same type. See if we know how to reuse the stored // value (depending on its type). diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 0c598b9563..7f261fcfe3 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -1671,7 +1671,12 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL, // extremely poorly defined currently. The long-term goal is to remove GEPing // over a vector from the IR completely. if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) { - unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType()); + // HLSL Change: Use alloc size instead of primitive type size for vector + // elements. DXC's data layout pads min precision types (i16:32, f16:32), + // so getTypeAllocSize matches the GEP offset stride while + // getTypeSizeInBits returns the unpadded primitive width. + unsigned ElementSizeInBits = + DL.getTypeAllocSizeInBits(VecTy->getScalarType()); if (ElementSizeInBits % 8 != 0) { // GEPs over non-multiple of 8 size vector elements are invalid. return nullptr; } @@ -2134,7 +2139,8 @@ static VectorType *isVectorPromotionViable(AllocaSlices::Partition &P, // Try each vector type, and return the one which works. auto CheckVectorTypeForPromotion = [&](VectorType *VTy) { - uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()); + // HLSL Change: Use alloc size to match GEP offset stride for padded types. + uint64_t ElementSize = DL.getTypeAllocSizeInBits(VTy->getElementType()); // While the definition of LLVM vectors is bitpacked, we don't support sizes // that aren't byte sized.
@@ -2492,12 +2498,14 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> { : nullptr), VecTy(PromotableVecTy), ElementTy(VecTy ? VecTy->getElementType() : nullptr), - ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0), + // HLSL Change: Use alloc size to match GEP offset stride for padded + // types. + ElementSize(VecTy ? DL.getTypeAllocSizeInBits(ElementTy) / 8 : 0), BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(), OldPtr(), PHIUsers(PHIUsers), SelectUsers(SelectUsers), IRB(NewAI.getContext(), ConstantFolder()) { if (VecTy) { - assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 && + assert((DL.getTypeAllocSizeInBits(ElementTy) % 8) == 0 && "Only multiple-of-8 sized vector elements are viable"); ++NumVectorized; } diff --git a/test/Transforms/GVN/min-precision-padding.ll b/test/Transforms/GVN/min-precision-padding.ll new file mode 100644 index 0000000000..cdb51d9ce1 --- /dev/null +++ b/test/Transforms/GVN/min-precision-padding.ll @@ -0,0 +1,153 @@ +; RUN: opt < %s -basicaa -gvn -S | FileCheck %s + +; Regression test for min precision vector GVN miscompilation. +; DXC's data layout pads i16 to 32 bits (i16:32). GVN must not: +; 1. Coerce padded vector types via bitcast (CanCoerceMustAliasedValueToLoad) +; 2. Forward a zeroinitializer store past partial element stores (processLoad) +; +; Without the fix, GVN would forward the zeroinitializer vector load, producing +; incorrect all-zero results for elements that were individually written. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Test 1: GVN must not forward zeroinitializer past element store for <3 x i16>. +; The store of zeroinitializer to %dst is followed by an element store to +; %dst[0], then a vector load of %dst. GVN must not replace the vector load +; with the zeroinitializer.
+ +; CHECK-LABEL: @test_no_forward_i16_vec3 +; CHECK: store <3 x i16> zeroinitializer +; CHECK: store i16 %val +; The vector load must survive — GVN must not replace it with zeroinitializer. +; CHECK: %result = load <3 x i16> +; CHECK: ret <3 x i16> %result +define <3 x i16> @test_no_forward_i16_vec3(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %elem0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %val, i16* %elem0, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 2: Same pattern with <3 x half> (f16:32 padding). + +; CHECK-LABEL: @test_no_forward_f16_vec3 +; CHECK: store <3 x half> zeroinitializer +; CHECK: store half %val +; CHECK: %result = load <3 x half> +; CHECK: ret <3 x half> %result +define <3 x half> @test_no_forward_f16_vec3(half %val) { +entry: + %dst = alloca <3 x half>, align 4 + store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 + %elem0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 + store half %val, half* %elem0, align 4 + %result = load <3 x half>, <3 x half>* %dst, align 4 + ret <3 x half> %result +} + +; Test 3: Multiple element stores — all must survive. +; Stores to elements 0, 1, 2 of a <3 x i16> vector after zeroinitializer. 
+ +; CHECK-LABEL: @test_no_forward_i16_vec3_all_elems +; CHECK: store <3 x i16> zeroinitializer +; CHECK: store i16 %v0 +; CHECK: store i16 %v1 +; CHECK: store i16 %v2 +; CHECK: %result = load <3 x i16> +; CHECK: ret <3 x i16> %result +define <3 x i16> @test_no_forward_i16_vec3_all_elems(i16 %v0, i16 %v1, i16 %v2) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 4: Coercion rejection — store a <3 x i16> vector, load as different type. +; GVN must not attempt bitcast coercion on padded types. + +; CHECK-LABEL: @test_no_coerce_i16_vec3 +; CHECK: store <3 x i16> +; The load of a different type from the same pointer must not be coerced. +; CHECK: load +; CHECK: ret +define i96 @test_no_coerce_i16_vec3(<3 x i16> %v) { +entry: + %ptr = alloca <3 x i16>, align 4 + store <3 x i16> %v, <3 x i16>* %ptr, align 4 + %iptr = bitcast <3 x i16>* %ptr to i96* + %result = load i96, i96* %iptr, align 4 + ret i96 %result +} + +; Test 5: Long vector variant — <5 x i16> (exceeds 4-element native size). 
+ +; CHECK-LABEL: @test_no_forward_i16_vec5 +; CHECK: store <5 x i16> zeroinitializer +; CHECK: store i16 %val +; CHECK: %result = load <5 x i16> +; CHECK: ret <5 x i16> %result +define <5 x i16> @test_no_forward_i16_vec5(i16 %val) { +entry: + %dst = alloca <5 x i16>, align 4 + store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 + %elem0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 + store i16 %val, i16* %elem0, align 4 + %result = load <5 x i16>, <5 x i16>* %dst, align 4 + ret <5 x i16> %result +} + +; Test 6: Long vector variant — <8 x half>. + +; CHECK-LABEL: @test_no_forward_f16_vec8 +; CHECK: store <8 x half> zeroinitializer +; CHECK: store half %val +; CHECK: %result = load <8 x half> +; CHECK: ret <8 x half> %result +define <8 x half> @test_no_forward_f16_vec8(half %val) { +entry: + %dst = alloca <8 x half>, align 4 + store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 + %elem0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 + store half %val, half* %elem0, align 4 + %result = load <8 x half>, <8 x half>* %dst, align 4 + ret <8 x half> %result +} + +; Test 7: Same-type store-to-load forwarding must still work for padded types. +; GVN should forward %v directly — no intervening writes, same type. + +; CHECK-LABEL: @test_same_type_forward_i16_vec3 +; The load should be eliminated and %v returned directly. +; CHECK-NOT: load +; CHECK: ret <3 x i16> %v +define <3 x i16> @test_same_type_forward_i16_vec3(<3 x i16> %v) { +entry: + %ptr = alloca <3 x i16>, align 4 + store <3 x i16> %v, <3 x i16>* %ptr, align 4 + %result = load <3 x i16>, <3 x i16>* %ptr, align 4 + ret <3 x i16> %result +} + +; Test 8: Same-type forwarding for <3 x half>. 
+ +; CHECK-LABEL: @test_same_type_forward_f16_vec3 +; CHECK-NOT: load +; CHECK: ret <3 x half> %v +define <3 x half> @test_same_type_forward_f16_vec3(<3 x half> %v) { +entry: + %ptr = alloca <3 x half>, align 4 + store <3 x half> %v, <3 x half>* %ptr, align 4 + %result = load <3 x half>, <3 x half>* %ptr, align 4 + ret <3 x half> %result +} diff --git a/test/Transforms/SROA/min-precision-padding.ll b/test/Transforms/SROA/min-precision-padding.ll new file mode 100644 index 0000000000..b431b5ea07 --- /dev/null +++ b/test/Transforms/SROA/min-precision-padding.ll @@ -0,0 +1,154 @@ +; RUN: opt < %s -sroa -S | FileCheck %s + +; Regression test for SROA miscompilation of min precision vector element access. +; DXC's data layout pads i16 to 32 bits (i16:32) and f16 to 32 bits (f16:32). +; SROA must use getTypeAllocSizeInBits (not getTypeSizeInBits) for vector element +; stride so that GEP byte offsets map to correct element indices. +; +; Without the fix, SROA used getTypeSizeInBits(i16)=16 bits (2 bytes) for the +; element stride, but GEP offsets use getTypeAllocSize(i16)=4 bytes. So byte +; offset 4 (element 1) was mapped to index 4/2=2 instead of 4/4=1, causing +; element stores to be misplaced or eliminated. + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Test 1: Element-wise write to <3 x i16> vector. +; SROA must map GEP byte offsets to correct element indices using alloc size +; (4 bytes per i16), not primitive size (2 bytes). All stores must survive +; with correct indices, and the final vector load must be preserved. 
+ +; CHECK-LABEL: @test_sroa_i16_vec3 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 0 +; CHECK: store i16 %v0 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %v1 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 +; CHECK: store i16 %v2 +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3(i16 %v0, i16 %v1, i16 %v2) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 2: Same pattern with <3 x half> (f16:32 padding). 
+ +; CHECK-LABEL: @test_sroa_f16_vec3 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 0 +; CHECK: store half %v0 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 1 +; CHECK: store half %v1 +; CHECK: getelementptr inbounds <3 x half>, <3 x half>* %{{.*}}, i32 0, i32 2 +; CHECK: store half %v2 +; CHECK: load <3 x half> +; CHECK: ret <3 x half> +define <3 x half> @test_sroa_f16_vec3(half %v0, half %v1, half %v2) { +entry: + %dst = alloca <3 x half>, align 4 + store <3 x half> zeroinitializer, <3 x half>* %dst, align 4 + %e0 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 0 + store half %v0, half* %e0, align 4 + %e1 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 1 + store half %v1, half* %e1, align 4 + %e2 = getelementptr inbounds <3 x half>, <3 x half>* %dst, i32 0, i32 2 + store half %v2, half* %e2, align 4 + %result = load <3 x half>, <3 x half>* %dst, align 4 + ret <3 x half> %result +} + +; Test 3: Partial write — only element 1 is stored. SROA must index it correctly. + +; CHECK-LABEL: @test_sroa_i16_vec3_elem1 +; Element 1 store must be correctly placed at GEP index 1, not index 2. +; Without the fix, byte offset 4 / prim_size 2 = index 2 (wrong). +; With the fix, byte offset 4 / alloc_size 4 = index 1 (correct). +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %val +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3_elem1(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e1 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 1 + store i16 %val, i16* %e1, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 4: Element 2 store — verifies highest index is correct. 
+ +; CHECK-LABEL: @test_sroa_i16_vec3_elem2 +; CHECK: getelementptr inbounds <3 x i16>, <3 x i16>* %{{.*}}, i32 0, i32 2 +; CHECK: store i16 %val +; CHECK: load <3 x i16> +; CHECK: ret <3 x i16> +define <3 x i16> @test_sroa_i16_vec3_elem2(i16 %val) { +entry: + %dst = alloca <3 x i16>, align 4 + store <3 x i16> zeroinitializer, <3 x i16>* %dst, align 4 + %e2 = getelementptr inbounds <3 x i16>, <3 x i16>* %dst, i32 0, i32 2 + store i16 %val, i16* %e2, align 4 + %result = load <3 x i16>, <3 x i16>* %dst, align 4 + ret <3 x i16> %result +} + +; Test 5: Long vector — <5 x i16> (exceeds 4-element native size). + +; CHECK-LABEL: @test_sroa_i16_vec5 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 0 +; CHECK: store i16 %v0 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 1 +; CHECK: store i16 %v1 +; CHECK: getelementptr inbounds <5 x i16>, <5 x i16>* %{{.*}}, i32 0, i32 4 +; CHECK: store i16 %v4 +; CHECK: load <5 x i16> +; CHECK: ret <5 x i16> +define <5 x i16> @test_sroa_i16_vec5(i16 %v0, i16 %v1, i16 %v2, i16 %v3, i16 %v4) { +entry: + %dst = alloca <5 x i16>, align 4 + store <5 x i16> zeroinitializer, <5 x i16>* %dst, align 4 + %e0 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 0 + store i16 %v0, i16* %e0, align 4 + %e1 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 1 + store i16 %v1, i16* %e1, align 4 + %e2 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 2 + store i16 %v2, i16* %e2, align 4 + %e3 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 3 + store i16 %v3, i16* %e3, align 4 + %e4 = getelementptr inbounds <5 x i16>, <5 x i16>* %dst, i32 0, i32 4 + store i16 %v4, i16* %e4, align 4 + %result = load <5 x i16>, <5 x i16>* %dst, align 4 + ret <5 x i16> %result +} + +; Test 6: Long vector — <8 x half>. 
+ +; CHECK-LABEL: @test_sroa_f16_vec8_partial +; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 0 +; CHECK: store half %v0 +; CHECK: getelementptr inbounds <8 x half>, <8 x half>* %{{.*}}, i32 0, i32 7 +; CHECK: store half %v7 +; CHECK: load <8 x half> +; CHECK: ret <8 x half> +define <8 x half> @test_sroa_f16_vec8_partial(half %v0, half %v7) { +entry: + %dst = alloca <8 x half>, align 4 + store <8 x half> zeroinitializer, <8 x half>* %dst, align 4 + %e0 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 0 + store half %v0, half* %e0, align 4 + %e7 = getelementptr inbounds <8 x half>, <8 x half>* %dst, i32 0, i32 7 + store half %v7, half* %e7, align 4 + %result = load <8 x half>, <8 x half>* %dst, align 4 + ret <8 x half> %result +}