diff --git a/layers/10_cmdbufemu/README.md b/layers/10_cmdbufemu/README.md index 20002798..39f43b71 100644 --- a/layers/10_cmdbufemu/README.md +++ b/layers/10_cmdbufemu/README.md @@ -34,6 +34,7 @@ The following environment variables can modify the behavior of the command buffe |----------------------|----------|-----------------| | `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`

`set CMDBUFEMU_EnhancedErrorChecking=1` | | `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`

`set CMDBUFEMU_KernelForProfiling=1` | +| `CMDBUFEMU_SuggestedLocalWorkSize` | Enables use of the suggested local work-group size extension to eliminate `NULL` local work-group sizes. Only valid when an implementation supports the suggested local work-group size extension and the command is not mutable. By default, use of the suggested local work-group size is enabled. | `export CMDBUFEMU_SuggestedLocalWorkSize=0`

`set CMDBUFEMU_SuggestedLocalWorkSize=0` | ## Known Limitations diff --git a/layers/10_cmdbufemu/emulate.cpp b/layers/10_cmdbufemu/emulate.cpp index 4929fdbb..4b475fe1 100644 --- a/layers/10_cmdbufemu/emulate.cpp +++ b/layers/10_cmdbufemu/emulate.cpp @@ -817,6 +817,7 @@ struct SVMMemFill : Command struct NDRangeKernel : Command { static std::unique_ptr create( + const bool isMutable, const cl_command_properties_khr* properties, cl_command_buffer_khr cmdbuf, cl_command_queue queue, @@ -1235,6 +1236,11 @@ typedef struct _cl_command_buffer_khr cmdbuf->TestQueues.reserve(num_queues); cmdbuf->BlockingEvents.reserve(num_queues); + if( cmdbuf->Queues.size() == 1 ) + { + cmdbuf->setupSuggestedLocalWorkSize(); + } + for( auto queue : cmdbuf->Queues ) { g_pNextDispatch->clRetainCommandQueue(queue); @@ -1683,6 +1689,32 @@ typedef struct _cl_command_buffer_khr return CL_SUCCESS; } + cl_int clGetKernelSuggestedLocalWorkSize( + cl_command_queue queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t* global_work_offset, + const size_t* global_work_size, + size_t* suggested_local_work_size ) + { + if( ptrGetKernelSuggestedLocalWorkSizeKHR == nullptr ) + { + return CL_INVALID_OPERATION; + } + if( queue != nullptr && queue != Queues[0] ) + { + return CL_INVALID_COMMAND_QUEUE; + } + + return ptrGetKernelSuggestedLocalWorkSizeKHR( + Queues[0], + kernel, + work_dim, + global_work_offset, + global_work_size, + suggested_local_work_size ); + } + private: static constexpr cl_uint cMagic = 0x434d4442; // "CMDB" @@ -1703,6 +1735,32 @@ typedef struct _cl_command_buffer_khr std::vector> Commands; std::atomic NextSyncPoint; + clGetKernelSuggestedLocalWorkSizeKHR_fn ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr; + + void setupSuggestedLocalWorkSize() + { + cl_device_id device = nullptr; + g_pNextDispatch->clGetCommandQueueInfo( + Queues[0], + CL_QUEUE_DEVICE, + sizeof(device), + &device, + nullptr ); + + cl_platform_id platform = nullptr; + g_pNextDispatch->clGetDeviceInfo( + device, + 
CL_DEVICE_PLATFORM, + sizeof(platform), + &platform, + nullptr ); + + ptrGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn) + g_pNextDispatch->clGetExtensionFunctionAddressForPlatform( + platform, + "clGetKernelSuggestedLocalWorkSizeKHR" ); + } + void setupTestQueue(cl_command_queue src) { if( g_EnhancedErrorChecking ) @@ -1847,6 +1905,7 @@ _cl_mutable_command_khr::_cl_mutable_command_khr( Queue(queue ? queue : cmdbuf->getQueue()) {} std::unique_ptr NDRangeKernel::create( + const bool isMutable, const cl_command_properties_khr* properties, cl_command_buffer_khr cmdbuf, cl_command_queue queue, @@ -1964,6 +2023,21 @@ std::unique_ptr NDRangeKernel::create( local_work_size, local_work_size + work_dim); } + else if( g_SuggestedLocalWorkSize && isMutable == false ) + { + command->local_work_size.resize(work_dim); + cl_int checkError = cmdbuf->clGetKernelSuggestedLocalWorkSize( + queue, + kernel, + work_dim, + global_work_offset, + global_work_size, + command->local_work_size.data() ); + if( checkError != CL_SUCCESS ) + { + command->local_work_size.clear(); + } + } g_pNextDispatch->clRetainKernel(command->original_kernel); @@ -2838,8 +2912,11 @@ cl_int CL_API_CALL clCommandNDRangeKernelKHR_EMU( } } + const bool isMutable = mutable_handle != nullptr; + cl_int errorCode = CL_SUCCESS; auto command = NDRangeKernel::create( + isMutable, properties, cmdbuf, command_queue, diff --git a/layers/10_cmdbufemu/emulate.h b/layers/10_cmdbufemu/emulate.h index a2e9ccd9..4bd494ef 100644 --- a/layers/10_cmdbufemu/emulate.h +++ b/layers/10_cmdbufemu/emulate.h @@ -11,6 +11,7 @@ extern bool g_EnhancedErrorChecking; extern bool g_KernelForProfiling; +extern bool g_SuggestedLocalWorkSize; extern const struct _cl_icd_dispatch* g_pNextDispatch; diff --git a/layers/10_cmdbufemu/main.cpp b/layers/10_cmdbufemu/main.cpp index b1c78f66..2ac2d9df 100644 --- a/layers/10_cmdbufemu/main.cpp +++ b/layers/10_cmdbufemu/main.cpp @@ -35,10 +35,16 @@ bool g_EnhancedErrorChecking = 
false; // Using kernels for profiling can fix issues with some implementations -// that do not properly support event profiling on barrkers. +// that do not properly support event profiling on barriers. bool g_KernelForProfiling = false; +// Using the suggested local work-group size can reduce overhead by determining +// the values for a NULL local work-group size when the command buffer is +// created rather than when it is executed. + +bool g_SuggestedLocalWorkSize = true; + const struct _cl_icd_dispatch* g_pNextDispatch = NULL; static cl_int CL_API_CALL @@ -231,7 +237,7 @@ static void _init_dispatch() } CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo( - cl_layer_info param_name, + cl_layer_info param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) @@ -251,10 +257,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo( #if defined(CL_LAYER_NAME) case CL_LAYER_NAME: { + char str[256]; + snprintf(str, 256, "Emulation Layer for " + CL_KHR_COMMAND_BUFFER_EXTENSION_NAME + " (EEC: %s, KFP: %s, SLWS: %s)", + g_EnhancedErrorChecking ? "Y" : "N", + g_KernelForProfiling ? "Y" : "N", + g_SuggestedLocalWorkSize ? 
"Y" : "N"); auto ptr = (char*)param_value; return writeStringToMemory( param_value_size, - "Emulation Layer for " CL_KHR_COMMAND_BUFFER_EXTENSION_NAME, + str, param_value_size_ret, ptr); } @@ -290,6 +303,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties( getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking); getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling); + getControl("CMDBUFEMU_SuggestedLocalWorkSize", g_SuggestedLocalWorkSize); g_pNextDispatch = target_dispatch; diff --git a/samples/12_commandbuffers/main.cpp b/samples/12_commandbuffers/main.cpp index f176a5c7..879a69f9 100644 --- a/samples/12_commandbuffers/main.cpp +++ b/samples/12_commandbuffers/main.cpp @@ -264,17 +264,17 @@ int main( cl_sync_point_khr sync_point; clCommandNDRangeKernelKHR( cmdbuf, - NULL, - NULL, + NULL, // command queue, can be NULL to use the command buffer queue + NULL, // command properties kernel(), - 1, - NULL, - &gwx, - NULL, - 0, - NULL, + 1, // work dim + NULL, // global work offset + &gwx, // global work size + NULL, // local work size + 0, // num sync points in wait list + NULL, // sync point wait list &sync_point, - NULL); + NULL); // mutable handle clFinalizeCommandBufferKHR(cmdbuf); clEnqueueCommandBufferKHR( diff --git a/samples/12_commandbufferspp/main.cpp b/samples/12_commandbufferspp/main.cpp index 9e566b7c..460939f8 100644 --- a/samples/12_commandbufferspp/main.cpp +++ b/samples/12_commandbufferspp/main.cpp @@ -202,17 +202,17 @@ int main( cl_sync_point_khr sync_point; clCommandNDRangeKernelKHR( cmdbuf(), - NULL, - NULL, + NULL, // command queue, can be NULL to use the command buffer queue + NULL, // command properties kernel(), - 1, - NULL, - &gwx, - NULL, - 0, - NULL, + 1, // work dim + NULL, // global work offset + &gwx, // global work size + NULL, // local work size + 0, // num sync points in wait list + NULL, // sync point wait list &sync_point, - NULL); + NULL); // mutable handle cmdbuf.finalize(); 
clEnqueueCommandBufferKHR( diff --git a/samples/14_ooqcommandbuffers/main.cpp b/samples/14_ooqcommandbuffers/main.cpp index abed0ffe..679bce0d 100644 --- a/samples/14_ooqcommandbuffers/main.cpp +++ b/samples/14_ooqcommandbuffers/main.cpp @@ -152,17 +152,17 @@ int main( fillKernel.setArg(2, static_cast(gwx)); clCommandNDRangeKernelKHR( cmdbuf(), - nullptr, - nullptr, + nullptr, // command queue, can be NULL to use the command buffer queue + nullptr, // command properties fillKernel(), - 1, - nullptr, - &one, - nullptr, - 0, - nullptr, + 1, // work dim + nullptr, // global work offset + &one, // global work size + nullptr, // local work size + 0, // num sync points in wait list + nullptr, // sync point wait list &writeA, - nullptr); + nullptr); // mutable handle cl_sync_point_khr writeB = 0; fillKernel.setArg(0, deviceMemSrcB); @@ -188,17 +188,17 @@ int main( addKernel.setArg(2, deviceMemSrcB); clCommandNDRangeKernelKHR( cmdbuf(), - nullptr, - nullptr, + nullptr, // command queue, can be NULL to use the command buffer queue + nullptr, // command properties addKernel(), - 1, - nullptr, - &gwx, - nullptr, + 1, // work dim + nullptr, // global work offset + &gwx, // global work size + nullptr, // local work size static_cast(waitList.size()), waitList.data(), - nullptr, - nullptr); + nullptr, // sync point + nullptr); // mutable handle cmdbuf.finalize(); // Ensure the queue is empty and no processing is happening