diff --git a/test/staticsize.jl b/test/staticsize.jl
index 6ff2b0f6..e0902f43 100644
--- a/test/staticsize.jl
+++ b/test/staticsize.jl
@@ -135,3 +135,71 @@ end
         @test sum2_10turbo(A) ≈ sum(A)
     end
 end
+
+# Test for Issue #543: W=1 nested VecUnroll store on ARM.
+# This tests the case where the vector width is 1 (scalar) with nested unrolling.
+function issue543_noavx!(data_out, matrix, data_in)  # plain-loop reference kernel
+    for j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)
+        res = zero(eltype(data_out))  # accumulator in the output element type
+        for jj in axes(matrix, 2)
+            res += matrix[j, jj] * data_in[v, i, jj]  # sum over `jj`
+        end
+        data_out[v, i, j] = res
+    end
+    return nothing
+end
+
+function issue543_turbo!(data_out, matrix, data_in)  # same loop nest under `@turbo`
+    @turbo for j in axes(data_out, 3), i in axes(data_out, 2), v in axes(data_out, 1)
+        res = zero(eltype(data_out))
+        for jj in axes(matrix, 2)
+            res += matrix[j, jj] * data_in[v, i, jj]
+        end
+        data_out[v, i, j] = res
+    end
+    return nothing
+end
+
+@testset "Issue #543: W=1 Nested VecUnroll" begin
+    # Case 1: all output dimensions (and both `matrix` dimensions) are `StaticInt`s.
+    for v in 1:4, n in 2:8
+        data_out_ref = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n))
+        data_out_turbo = StrideArray(undef, StaticInt(v), StaticInt(n), StaticInt(n))
+        matrix = StrideArray(undef, StaticInt(n), StaticInt(n))
+        data_in = rand(v, n, n)
+
+        matrix .= rand.()  # fill `matrix` in place with random values
+
+        fill!(data_out_ref, 0.0)  # zero both outputs before running the kernels
+        fill!(data_out_turbo, 0.0)
+
+        issue543_noavx!(data_out_ref, matrix, data_in)  # reference result
+
+        # This is broken on Apple ARM CPUs (Apple M series) for some reason, so the
+        # `v == 1` case is skipped there. TODO: Fix the underlying issue!
+        if (v == 1) && Sys.isapple() && Sys.ARCH == :aarch64
+            @test_skip issue543_turbo!(data_out_turbo, matrix, data_in)
+        else
+            @test_nowarn issue543_turbo!(data_out_turbo, matrix, data_in)
+            @test data_out_turbo ≈ data_out_ref
+        end
+    end
+
+    # Case 2: the first output dimension is a runtime `Int`; the others stay static.
+    for v in 1:4, n in 2:8
+        data_out_ref = StrideArray(undef, v, StaticInt(n), StaticInt(n))
+        data_out_turbo = StrideArray(undef, v, StaticInt(n), StaticInt(n))
+        matrix = StrideArray(undef, StaticInt(n), StaticInt(n))
+        data_in = rand(v, n, n)
+
+        matrix .= rand.()  # fill `matrix` in place with random values
+
+        fill!(data_out_ref, 0.0)  # zero both outputs before running the kernels
+        fill!(data_out_turbo, 0.0)
+
+        issue543_noavx!(data_out_ref, matrix, data_in)  # reference result
+        issue543_turbo!(data_out_turbo, matrix, data_in)  # result under test
+
+        @test data_out_turbo ≈ data_out_ref
+    end
+end