@@ -1,11 +1,10 @@
 # Need https://github.com/JuliaLang/julia/pull/33970
 # and https://github.com/JuliaLang/julia/pull/34043
-if VERSION >= v"1.4.0-DEV.666" && capability(device()) >= v"7.0"
+if VERSION >= v"1.5.0-DEV.437" && capability(device()) >= v"7.0"

 using CUDAnative.WMMA

-is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
-(is_debug && VERSION < v"1.5.0-DEV.437") ? @warn("Skipping WMMA tests due to incompatible Julia") : @testset "WMMA" begin
+@testset "WMMA" begin

 ################################################################################

@@ -231,20 +230,18 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
         return
     end

-    @test_broken_if VERSION >= v"1.5.0-DEV.393" begin
-        @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta)
-        d = Array(d_dev)
+    @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta)
+    d = Array(d_dev)

-        new_a = (a_layout == ColMajor) ? a : transpose(a)
-        new_b = (b_layout == ColMajor) ? b : transpose(b)
-        new_c = (c_layout == ColMajor) ? c : transpose(c)
-        new_d = (d_layout == ColMajor) ? d : transpose(d)
+    new_a = (a_layout == ColMajor) ? a : transpose(a)
+    new_b = (b_layout == ColMajor) ? b : transpose(b)
+    new_c = (c_layout == ColMajor) ? c : transpose(c)
+    new_d = (d_layout == ColMajor) ? d : transpose(d)

-        if do_mac
-            all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16))))
-        else
-            all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16))))
-        end
+    if do_mac
+        @test_broken all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16))))
+    else
+        @test_broken all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16))))
     end
 end

@@ -254,40 +251,38 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0

 # Need https://github.com/JuliaLang/julia/pull/34760
 # See https://github.com/JuliaGPU/CUDAnative.jl/issues/548
-if VERSION >= v"1.5.0-DEV.324"
-    @testset "Codegen addressing" begin
-        @testset "Global" begin
-            function kernel(d)
-                conf = WMMA.Config{16, 16, 16, Float32}
-
-                d_frag = WMMA.fill_c(Float32(0), conf)
-                WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)
-
-                return
-            end
+@testset "Codegen addressing" begin
+    @testset "Global" begin
+        function kernel(d)
+            conf = WMMA.Config{16, 16, 16, Float32}

-            ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},)))
+            d_frag = WMMA.fill_c(Float32(0), conf)
+            WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)

-            @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-            @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
+            return
         end

-        @testset "Shared" begin
-            function kernel()
-                shmem = @cuStaticSharedMem(Float32, (16, 16))
-                conf = WMMA.Config{16, 16, 16, Float32}
+        ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},)))

-                d_frag = WMMA.fill_c(Float32(0), conf)
-                WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)
+        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
+        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
+    end

-                return
-            end
+    @testset "Shared" begin
+        function kernel()
+            shmem = @cuStaticSharedMem(Float32, (16, 16))
+            conf = WMMA.Config{16, 16, 16, Float32}

-            ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ()))
+            d_frag = WMMA.fill_c(Float32(0), conf)
+            WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)

-            @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-            @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
+            return
         end
+
+        ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ()))
+
+        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
+        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
     end
 end

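
For reference, the fragment-level WMMA API exercised by these tests is used roughly as in the sketch below. This is a minimal illustration, not part of the diff: it assumes CUDAnative's high-level wrappers WMMA.load_a, WMMA.load_b, WMMA.load_c and WMMA.mma (alongside the WMMA.Config, WMMA.fill_c and WMMA.store_d calls visible above), CuArrays for host-side arrays, and a device of compute capability 7.0 or higher.

# Minimal sketch (not from this commit): one warp computes D = A * B + C for a
# single 16x16x16 tile using the fragment-level WMMA wrappers tested above.
using CUDAnative, CuArrays
using CUDAnative.WMMA

function wmma_kernel(a_dev, b_dev, c_dev, d_dev)
    conf = WMMA.Config{16, 16, 16, Float32}          # M = N = K = 16, Float32 accumulator

    a_frag = WMMA.load_a(pointer(a_dev), 16, WMMA.ColMajor, conf)
    b_frag = WMMA.load_b(pointer(b_dev), 16, WMMA.ColMajor, conf)
    c_frag = WMMA.load_c(pointer(c_dev), 16, WMMA.ColMajor, conf)

    d_frag = WMMA.mma(a_frag, b_frag, c_frag, conf)  # warp-level matrix multiply-accumulate

    WMMA.store_d(pointer(d_dev), d_frag, 16, WMMA.ColMajor, conf)
    return
end

a = CuArray(rand(Float16, 16, 16))
b = CuArray(rand(Float16, 16, 16))
c = CuArray(rand(Float32, 16, 16))
d = similar(c)

@cuda threads=32 wmma_kernel(a, b, c, d)             # WMMA operates at warp granularity
@assert Array(d) ≈ Float32.(Array(a)) * Float32.(Array(b)) + Array(c)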