@@ -1,11 +1,10 @@
 # Need https://github.com/JuliaLang/julia/pull/33970
 # and https://github.com/JuliaLang/julia/pull/34043
-if VERSION >= v"1.4.0-DEV.666" && capability(device()) >= v"7.0"
+if VERSION >= v"1.5.0-DEV.437" && capability(device()) >= v"7.0"

 using CUDAnative.WMMA

-is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
-(is_debug && VERSION < v"1.5.0-DEV.437") ? @warn("Skipping WMMA tests due to incompatible Julia") : @testset "WMMA" begin
+@testset "WMMA" begin

 ################################################################################

@@ -231,20 +230,18 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
         return
     end

-    @test_broken_if VERSION >= v"1.5.0-DEV.393" begin
-        @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta)
-        d = Array(d_dev)
+    @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta)
+    d = Array(d_dev)

-        new_a = (a_layout == ColMajor) ? a : transpose(a)
-        new_b = (b_layout == ColMajor) ? b : transpose(b)
-        new_c = (c_layout == ColMajor) ? c : transpose(c)
-        new_d = (d_layout == ColMajor) ? d : transpose(d)
+    new_a = (a_layout == ColMajor) ? a : transpose(a)
+    new_b = (b_layout == ColMajor) ? b : transpose(b)
+    new_c = (c_layout == ColMajor) ? c : transpose(c)
+    new_d = (d_layout == ColMajor) ? d : transpose(d)

-        if do_mac
-            all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16))))
-        else
-            all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16))))
-        end
+    if do_mac
+        @test_broken all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16))))
+    else
+        @test_broken all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16))))
     end
 end

@@ -254,40 +251,38 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0

 # Need https://github.com/JuliaLang/julia/pull/34760
 # See https://github.com/JuliaGPU/CUDAnative.jl/issues/548
-if VERSION >= v"1.5.0-DEV.324"
-    @testset "Codegen addressing" begin
-        @testset "Global" begin
-            function kernel(d)
-                conf = WMMA.Config{16, 16, 16, Float32}
-
-                d_frag = WMMA.fill_c(Float32(0), conf)
-                WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)
-
-                return
-            end
+@testset "Codegen addressing" begin
+    @testset "Global" begin
+        function kernel(d)
+            conf = WMMA.Config{16, 16, 16, Float32}

-            ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},)))
+            d_frag = WMMA.fill_c(Float32(0), conf)
+            WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)

-            @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-            @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
+            return
         end

-        @testset "Shared" begin
-            function kernel()
-                shmem = @cuStaticSharedMem(Float32, (16, 16))
-                conf = WMMA.Config{16, 16, 16, Float32}
+        ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},)))

-                d_frag = WMMA.fill_c(Float32(0), conf)
-                WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)
+        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
+        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
+    end

-                return
-            end
+    @testset "Shared" begin
+        function kernel()
+            shmem = @cuStaticSharedMem(Float32, (16, 16))
+            conf = WMMA.Config{16, 16, 16, Float32}

-            ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ()))
+            d_frag = WMMA.fill_c(Float32(0), conf)
+            WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)

-            @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-            @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
+            return
         end
+
+        ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ()))
+
+        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
+        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
     end
 end

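
For reference, the fragment-level WMMA API exercised by these tests is used roughly as in the sketch below. This is a minimal illustration, not part of the diff: it assumes CUDAnative's high-level wrappers WMMA.load_a, WMMA.load_b, WMMA.load_c and WMMA.mma (alongside the WMMA.Config, WMMA.fill_c and WMMA.store_d calls visible above), CuArrays for host-side arrays, and a device of compute capability 7.0 or higher.

# Minimal sketch (not from this commit): one warp computes D = A * B + C for a
# single 16x16x16 tile using the fragment-level WMMA wrappers tested above.
using CUDAnative, CuArrays
using CUDAnative.WMMA

function wmma_kernel(a_dev, b_dev, c_dev, d_dev)
    conf = WMMA.Config{16, 16, 16, Float32}          # M = N = K = 16, Float32 accumulator

    a_frag = WMMA.load_a(pointer(a_dev), 16, WMMA.ColMajor, conf)
    b_frag = WMMA.load_b(pointer(b_dev), 16, WMMA.ColMajor, conf)
    c_frag = WMMA.load_c(pointer(c_dev), 16, WMMA.ColMajor, conf)

    d_frag = WMMA.mma(a_frag, b_frag, c_frag, conf)  # warp-level matrix multiply-accumulate

    WMMA.store_d(pointer(d_dev), d_frag, 16, WMMA.ColMajor, conf)
    return
end

a = CuArray(rand(Float16, 16, 16))
b = CuArray(rand(Float16, 16, 16))
c = CuArray(rand(Float32, 16, 16))
d = similar(c)

@cuda threads=32 wmma_kernel(a, b, c, d)             # WMMA operates at warp granularity
@assert Array(d) ≈ Float32.(Array(a)) * Float32.(Array(b)) + Array(c)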