@@ -162,28 +162,27 @@ struct HostKernel{F,TT} <: AbstractKernel{F,TT}
162162end
163163
164164function launch_configuration(kernel::HostKernel{F,TT}) where {F,TT}
165- # XXX: have the user pass in a global size to clamp against
166- # maxGroupSizeX/Y/Z?
167-
168- # XXX: shrink until a multiple of preferredGroupSize?
165+ # Level Zero's zeKernelSuggestGroupSize provides a launch configuration
166+ # that exactly cover the input size. This can result in very awkward
167+ # configurations, so roll our own version that behaves like CUDA's
168+ # occupancy API and assumes the kernel still does bounds checking.
169169
170170 # once the MAX_GROUP_SIZE extension is implemented, we can use it here
171171 kernel_props = oneL0.properties(kernel.fun)
172- if kernel_props.maxGroupSize !== missing
173- return kernel_props.maxGroupSize
172+ group_size = if kernel_props.maxGroupSize !== missing
173+ kernel_props.maxGroupSize
174+ else
175+ dev = kernel.fun.mod.device
176+ compute_props = oneL0.compute_properties(dev)
177+ max_size = compute_props.maxTotalGroupSize
178+
179+ # # when the kernel uses many registers (which we can't query without
180+ # # extensions that landed _after_ MAX_GROUP_SIZE, so don't bother)
181+ # # the groupsize should be halved
182+ group_size = max_size ÷ 2
174183 end
175184
176- # otherwise, we'd use `zeKernelSuggestGroupSize` but it's been observed
177- # to return really bad configs (JuliaGPU/oneAPI.jl#430)
178-
179- # so instead, calculate it ourselves based on the device properties
180- dev = kernel.fun.mod.device
181- compute_props = oneL0.compute_properties(dev)
182- max_size = compute_props.maxTotalGroupSize
183- # # when the kernel uses many registers (which we can't query without
184- # # extensions that landed _after_ MAX_GROUP_SIZE, so don't bother)
185- # # the groupsize should be halved
186- group_size = max_size ÷ 2
185+ # TODO: align the group size based on preferredGroupSize
187186
188187 return group_size
189188end
0 commit comments