diff --git a/src/device/cuda.jl b/src/device/cuda.jl
index 4e2772ad..758999a0 100644
--- a/src/device/cuda.jl
+++ b/src/device/cuda.jl
@@ -11,7 +11,9 @@ include("cuda/assertion.jl")
 include("cuda/memory_dynamic.jl")
 include("cuda/atomics.jl")
 include("cuda/misc.jl")
+if VERSION >= v"1.4.1"
 include("cuda/wmma.jl")
+end
 
 # functionality from libdevice
 #
diff --git a/src/device/cuda/memory_shared.jl b/src/device/cuda/memory_shared.jl
index c19d5692..99ed6e03 100644
--- a/src/device/cuda/memory_shared.jl
+++ b/src/device/cuda/memory_shared.jl
@@ -61,7 +61,6 @@ end
     end
 
     T_ptr = convert(LLVMType, DevicePtr{T,AS.Shared})
-    T_actual_ptr = LLVM.PointerType(eltyp)
 
     # create a function
    llvm_f, _ = create_function(T_ptr)
@@ -92,10 +91,9 @@
         entry = BasicBlock(llvm_f, "entry", JuliaContext())
         position!(builder, entry)
 
-        ptr_with_as = gep!(builder, gv, [ConstantInt(0, JuliaContext()),
-                                         ConstantInt(0, JuliaContext())])
+        ptr = gep!(builder, gv, [ConstantInt(0, JuliaContext()),
+                                 ConstantInt(0, JuliaContext())])
 
-        ptr = addrspacecast!(builder, ptr_with_as, T_actual_ptr)
         val = ptrtoint!(builder, ptr, T_ptr)
         ret!(builder, val)
     end
diff --git a/src/device/cuda/wmma.jl b/src/device/cuda/wmma.jl
index b8c75571..088166c8 100644
--- a/src/device/cuda/wmma.jl
+++ b/src/device/cuda/wmma.jl
@@ -7,9 +7,6 @@ using CUDAnative: AS, DevicePtr
 # CONSTANTS
 ################################################################################
 
-# Determines whether or not to Core.AddrSpacePtr is available
-const addrspaceptr_available = (VERSION >= v"1.5.0-DEV.324")
-
 # Maps PTX types to Julia array types
 const map_ptx_to_jl_array = Dict(
     "f16" => Float16,
@@ -52,24 +49,14 @@ get_frag_info(matrix, ptx_el_type) = (
 
 get_addrspace_info(addr_space) = convert(Int, map_ptx_as_to_as_ty[addr_space])
 
-if addrspaceptr_available
 @generated function Base.cconvert(::Type{Core.AddrSpacePtr{T, as}}, x::DevicePtr{T, AS}) where {T, as, AS}
-    # Addrspacecast from i8* to i8* is invalid in LLVM
-    if as == 0
-        return quote
-            return Base.bitcast(Core.AddrSpacePtr{T, as}, x)
-        end
-    else
-        ir = "%p = inttoptr i64 %0 to i8*
-              %ptr = addrspacecast i8* %p to i8 addrspace($as)*
-              ret i8 addrspace($as)* %ptr"
+    ir = "%ptr = inttoptr i64 %0 to i8 addrspace($as)*
+          ret i8 addrspace($as)* %ptr"
 
-        return quote
-            return Base.llvmcall($ir, Core.AddrSpacePtr{T, as}, Tuple{Int64}, Base.bitcast(Int64, x))
-        end
+    return quote
+        return Base.llvmcall($ir, Core.AddrSpacePtr{T, as}, Tuple{Int64}, Base.bitcast(Int64, x))
     end
 end
-end
 
 # Fix for https://github.com/JuliaGPU/CUDAnative.jl/issues/587.
 # Instead of ccall'ing the intrinsics with NTuple{N, T} (which gets lowered to
@@ -141,7 +128,7 @@ for mat in ["a", "b", "c"],
 
     ccall_name = "extern $llvm_intr"
 
-    ptr_ty = addrspaceptr_available ? Core.AddrSpacePtr{arr_ty, addr_space_int} : Ref{arr_ty}
+    ptr_ty = Core.AddrSpacePtr{arr_ty, addr_space_int}
     struct_ty = Symbol("LLVMStruct$sz")
 
     @eval $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride))
@@ -196,7 +183,7 @@ for mat in ["d"],
     frag_types = ntuple(i -> frag_ty, sz)
     frag_vars = ntuple(i -> :(data[$i]), sz)
 
-    ptr_ty = addrspaceptr_available ? Core.AddrSpacePtr{arr_ty, addr_space_int} : Ref{arr_ty}
+    ptr_ty = Core.AddrSpacePtr{arr_ty, addr_space_int}
 
     @eval $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride)
     @eval export $func_name
diff --git a/src/device/pointer.jl b/src/device/pointer.jl
index 14b942a9..7ea9cd4a 100644
--- a/src/device/pointer.jl
+++ b/src/device/pointer.jl
@@ -118,7 +118,7 @@ Base.:(+)(x::Integer, y::DevicePtr) = y + x
 
     T_int = convert(LLVMType, Int)
     T_ptr = convert(LLVMType, DevicePtr{T,A})
-    T_actual_ptr = LLVM.PointerType(eltyp)
+    T_actual_ptr = LLVM.PointerType(eltyp, convert(Int, A))
 
     # create a function
     param_types = [T_ptr, T_int]
@@ -130,10 +130,8 @@ Base.:(+)(x::Integer, y::DevicePtr) = y + x
         position!(builder, entry)
 
         ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr)
         ptr = gep!(builder, ptr, [parameters(llvm_f)[2]])
-        ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A)))
-
-        ld = load!(builder, ptr_with_as)
+        ld = load!(builder, ptr)
 
         if A != AS.Generic
             metadata(ld)[LLVM.MD_tbaa] = tbaa_addrspace(A)
@@ -153,7 +151,7 @@ end
 
     T_int = convert(LLVMType, Int)
     T_ptr = convert(LLVMType, DevicePtr{T,A})
-    T_actual_ptr = LLVM.PointerType(eltyp)
+    T_actual_ptr = LLVM.PointerType(eltyp, convert(Int, A))
 
     # create a function
     param_types = [T_ptr, eltyp, T_int]
@@ -165,11 +163,9 @@
         position!(builder, entry)
 
         ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr)
         ptr = gep!(builder, ptr, [parameters(llvm_f)[3]])
-        ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A)))
-
         val = parameters(llvm_f)[2]
-        st = store!(builder, val, ptr_with_as)
+        st = store!(builder, val, ptr)
 
         if A != AS.Generic
             metadata(st)[LLVM.MD_tbaa] = tbaa_addrspace(A)
@@ -201,8 +197,7 @@ const LDGTypes = Union{UInt8, UInt16, UInt32, UInt64,
     T_int32 = LLVM.Int32Type(JuliaContext())
     T_ptr = convert(LLVMType, DevicePtr{T,AS.Global})
 
-    T_actual_ptr = LLVM.PointerType(eltyp)
-    T_actual_ptr_as = LLVM.PointerType(eltyp, convert(Int, AS.Global))
+    T_actual_ptr = LLVM.PointerType(eltyp, convert(Int, AS.Global))
 
     # create a function
     param_types = [T_ptr, T_int]
@@ -222,7 +217,7 @@ const LDGTypes = Union{UInt8, UInt16, UInt32, UInt64,
         "llvm.nvvm.ldg.global.$class.$typ.p1$typ"
     end
     mod = LLVM.parent(llvm_f)
-    intrinsic_typ = LLVM.FunctionType(eltyp, [T_actual_ptr_as, T_int32])
+    intrinsic_typ = LLVM.FunctionType(eltyp, [T_actual_ptr, T_int32])
     intrinsic = LLVM.Function(mod, intrinsic_name, intrinsic_typ)
 
     # generate IR
@@ -231,11 +226,9 @@
         position!(builder, entry)
 
         ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr)
         ptr = gep!(builder, ptr, [parameters(llvm_f)[2]])
-        ptr_with_as = addrspacecast!(builder, ptr, T_actual_ptr_as)
-
         ld = call!(builder, intrinsic,
-                   [ptr_with_as, ConstantInt(Int32(align), JuliaContext())])
+                   [ptr, ConstantInt(Int32(align), JuliaContext())])
 
         metadata(ld)[LLVM.MD_tbaa] = tbaa_addrspace(AS.Global)
 
diff --git a/test/device/cuda.jl b/test/device/cuda.jl
index 43e51037..6d43e6b1 100644
--- a/test/device/cuda.jl
+++ b/test/device/cuda.jl
@@ -1133,6 +1133,17 @@ end
 end
 end
 
+@testset "shared memory" begin
+    function kernel()
+        shared = @cuStaticSharedMem(Float32, 1)
+        @atomic shared[threadIdx().x] += 0f0
+        return
+    end
+
+    @cuda kernel()
+    synchronize()
+end
+
 end
 
 end
diff --git a/test/device/wmma.jl b/test/device/wmma.jl
index 44f0a261..dfe9ae5e 100644
--- a/test/device/wmma.jl
+++ b/test/device/wmma.jl
@@ -1,6 +1,5 @@
 # Need https://github.com/JuliaLang/julia/pull/33970
 # and https://github.com/JuliaLang/julia/pull/34043
-if VERSION >= v"1.4.0-DEV.666" && capability(device()) >= v"7.0"
 
 using CUDAnative.WMMA
 
@@ -294,4 +293,3 @@ end
 ################################################################################
 
 end
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index 1cff84a2..38424346 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -33,7 +33,9 @@ include("device/execution.jl")
 include("device/pointer.jl")
 include("device/array.jl")
 include("device/cuda.jl")
+if VERSION >= v"1.4.1" && capability(device()) >= v"7.0"
 include("device/wmma.jl")
+end
 
 include("nvtx.jl")
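# ------------------------------------------------------------------------------
# Illustrative usage, not part of the diff above. Because device pointers now
# keep their address space instead of being addrspacecast'ed through the
# generic space, `@atomic` on a `@cuStaticSharedMem` array (the operation
# exercised by the new "shared memory" testset) can lower to a shared-space
# atomic. A minimal sketch of the kind of kernel this enables: the kernel name,
# the 256-bin histogram layout, and the use of CuArrays for device arrays are
# assumptions for this example, not code from this PR.

using CUDAnative, CuArrays

function histogram_kernel(counts, data, n)
    # block-local histogram held in shared memory
    shared = @cuStaticSharedMem(Int32, 256)

    # zero the shared bins
    i = threadIdx().x
    while i <= 256
        shared[i] = Int32(0)
        i += blockDim().x
    end
    sync_threads()

    # grid-stride loop accumulating bytes into the shared histogram;
    # this is the shared-memory @atomic the new testset covers
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    while i <= n
        @atomic shared[Int(data[i]) + 1] += Int32(1)
        i += blockDim().x * gridDim().x
    end
    sync_threads()

    # merge the block-local histogram into the global result
    i = threadIdx().x
    while i <= 256
        @atomic counts[i] += shared[i]
        i += blockDim().x
    end
    return
end

data = CuArray(rand(UInt8, 4096))
counts = CuArrays.zeros(Int32, 256)
@cuda threads=256 blocks=4 histogram_kernel(counts, data, length(data))
synchronize()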