@@ -122,6 +122,15 @@ function codegen(target::Symbol, job::CompilerJob;
122122 end
123123
124124 kernel_fn = LLVM. name (kernel)
125+
126+ if libraries
127+ # linking the device run-time library requires use of the CUDA linker,
128+ # which in turn switches compilation to device relocatable code (-rdc) mode.
129+ #
130+ # even if not doing any actual calls that need -rdc (i.e., calls to the run-time
131+ # library), this significantly hurts performance, so don't do it unconditionally
132+ need_libcudadevrt = ! isempty (decls (ir))
133+ end
125134 end
126135
127136 # dynamic parallelism
@@ -227,12 +236,16 @@ function codegen(target::Symbol, job::CompilerJob;
227236 jit_options[CUDAdrv. GENERATE_DEBUG_INFO] = true
228237 end
229238
230- # link the CUDA device library
231- @timeit to[] " linking" begin
232- linker = CUDAdrv. CuLink (jit_options)
233- CUDAdrv. add_file! (linker, libcudadevrt, CUDAdrv. LIBRARY)
234- CUDAdrv. add_data! (linker, kernel_fn, asm)
235- image = CUDAdrv. complete (linker)
239+ if libraries && need_libcudadevrt
240+ # link the CUDA device library
241+ @timeit to[] " linking" begin
242+ linker = CUDAdrv. CuLink (jit_options)
243+ CUDAdrv. add_file! (linker, libcudadevrt, CUDAdrv. LIBRARY)
244+ CUDAdrv. add_data! (linker, kernel_fn, asm)
245+ image = CUDAdrv. complete (linker)
246+ end
247+ else
248+ image = asm
236249 end
237250
238251 @timeit to[] " compilation" begin
0 commit comments