diff --git a/layers/10_cmdbufemu/README.md b/layers/10_cmdbufemu/README.md
index 20002798..39f43b71 100644
--- a/layers/10_cmdbufemu/README.md
+++ b/layers/10_cmdbufemu/README.md
@@ -34,6 +34,7 @@ The following environment variables can modify the behavior of the command buffe
|----------------------|----------|-----------------|
| `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`
`set CMDBUFEMU_EnhancedErrorChecking=1` |
| `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`
`set CMDBUFEMU_KernelForProfiling=1` |
+| `CMDBUFEMU_SuggestedLocalWorkSize` | Enables use of the suggested local work-group size extension (`cl_khr_suggested_local_work_size`) to replace `NULL` local work-group sizes with explicit values when a kernel command is recorded. Only takes effect when the implementation supports the extension, the command buffer has a single command queue, and the command is not mutable. By default, use of the suggested local work-group size is enabled. | `export CMDBUFEMU_SuggestedLocalWorkSize=0`
`set CMDBUFEMU_SuggestedLocalWorkSize=0` |
## Known Limitations
diff --git a/layers/10_cmdbufemu/emulate.cpp b/layers/10_cmdbufemu/emulate.cpp
index 4929fdbb..4b475fe1 100644
--- a/layers/10_cmdbufemu/emulate.cpp
+++ b/layers/10_cmdbufemu/emulate.cpp
@@ -817,6 +817,7 @@ struct SVMMemFill : Command
struct NDRangeKernel : Command
{
static std::unique_ptr create(
+ const bool isMutable,
const cl_command_properties_khr* properties,
cl_command_buffer_khr cmdbuf,
cl_command_queue queue,
@@ -1235,6 +1236,11 @@ typedef struct _cl_command_buffer_khr
cmdbuf->TestQueues.reserve(num_queues);
cmdbuf->BlockingEvents.reserve(num_queues);
+ if( cmdbuf->Queues.size() == 1 )
+ {
+ cmdbuf->setupSuggestedLocalWorkSize();
+ }
+
for( auto queue : cmdbuf->Queues )
{
g_pNextDispatch->clRetainCommandQueue(queue);
@@ -1683,6 +1689,32 @@ typedef struct _cl_command_buffer_khr
return CL_SUCCESS;
}
+    // Forwards a suggested local work-group size query to the underlying
+    // implementation, always using the command buffer's first queue.
+    //
+    // Returns CL_INVALID_OPERATION when the extension entry point is not
+    // available, CL_INVALID_COMMAND_QUEUE when a non-NULL queue other than
+    // the first queue is passed, and otherwise whatever the extension
+    // function returns.
+    cl_int clGetKernelSuggestedLocalWorkSize(
+        cl_command_queue queue,
+        cl_kernel kernel,
+        cl_uint work_dim,
+        const size_t* global_work_offset,
+        const size_t* global_work_size,
+        size_t* suggested_local_work_size )
+    {
+        // Keep this check first: Queues[0] is only read once the entry
+        // point is known to be set.
+        if( !ptrGetKernelSuggestedLocalWorkSizeKHR )
+        {
+            return CL_INVALID_OPERATION;
+        }
+
+        const auto firstQueue = Queues[0];
+
+        // A NULL queue means "use the command buffer queue"; any other
+        // queue must match it.
+        const bool isForeignQueue = queue != nullptr && queue != firstQueue;
+        if( isForeignQueue )
+        {
+            return CL_INVALID_COMMAND_QUEUE;
+        }
+
+        const cl_int queryResult = ptrGetKernelSuggestedLocalWorkSizeKHR(
+            firstQueue,
+            kernel,
+            work_dim,
+            global_work_offset,
+            global_work_size,
+            suggested_local_work_size );
+        return queryResult;
+    }
+
private:
static constexpr cl_uint cMagic = 0x434d4442; // "CMDB"
@@ -1703,6 +1735,32 @@ typedef struct _cl_command_buffer_khr
std::vector> Commands;
std::atomic NextSyncPoint;
+ clGetKernelSuggestedLocalWorkSizeKHR_fn ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr;
+
+    // Looks up clGetKernelSuggestedLocalWorkSizeKHR for the platform that
+    // owns the device of the command buffer's first queue and stores it in
+    // ptrGetKernelSuggestedLocalWorkSizeKHR.
+    //
+    // The pointer is left as nullptr when any query fails or when the
+    // platform does not expose the entry point. Expects Queues to hold at
+    // least one queue.
+    void setupSuggestedLocalWorkSize()
+    {
+        // Stay disabled unless every step below succeeds.
+        ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr;
+
+        cl_device_id device = nullptr;
+        cl_int errorCode = g_pNextDispatch->clGetCommandQueueInfo(
+            Queues[0],
+            CL_QUEUE_DEVICE,
+            sizeof(device),
+            &device,
+            nullptr );
+        if( errorCode != CL_SUCCESS || device == nullptr )
+        {
+            // Without a device there is no platform to query.
+            return;
+        }
+
+        cl_platform_id platform = nullptr;
+        errorCode = g_pNextDispatch->clGetDeviceInfo(
+            device,
+            CL_DEVICE_PLATFORM,
+            sizeof(platform),
+            &platform,
+            nullptr );
+        if( errorCode != CL_SUCCESS || platform == nullptr )
+        {
+            // Do not look up an extension function on an unknown platform.
+            return;
+        }
+
+        // This is nullptr when the platform does not provide the function.
+        ptrGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn)
+            g_pNextDispatch->clGetExtensionFunctionAddressForPlatform(
+                platform,
+                "clGetKernelSuggestedLocalWorkSizeKHR" );
+    }
+
void setupTestQueue(cl_command_queue src)
{
if( g_EnhancedErrorChecking )
@@ -1847,6 +1905,7 @@ _cl_mutable_command_khr::_cl_mutable_command_khr(
Queue(queue ? queue : cmdbuf->getQueue()) {}
std::unique_ptr NDRangeKernel::create(
+ const bool isMutable,
const cl_command_properties_khr* properties,
cl_command_buffer_khr cmdbuf,
cl_command_queue queue,
@@ -1964,6 +2023,21 @@ std::unique_ptr NDRangeKernel::create(
local_work_size,
local_work_size + work_dim);
}
+ else if( g_SuggestedLocalWorkSize && isMutable == false )
+ {
+ command->local_work_size.resize(work_dim);
+ cl_int checkError = cmdbuf->clGetKernelSuggestedLocalWorkSize(
+ queue,
+ kernel,
+ work_dim,
+ global_work_offset,
+ global_work_size,
+ command->local_work_size.data() );
+ if( checkError != CL_SUCCESS )
+ {
+ command->local_work_size.clear();
+ }
+ }
g_pNextDispatch->clRetainKernel(command->original_kernel);
@@ -2838,8 +2912,11 @@ cl_int CL_API_CALL clCommandNDRangeKernelKHR_EMU(
}
}
+ const bool isMutable = mutable_handle != nullptr;
+
cl_int errorCode = CL_SUCCESS;
auto command = NDRangeKernel::create(
+ isMutable,
properties,
cmdbuf,
command_queue,
diff --git a/layers/10_cmdbufemu/emulate.h b/layers/10_cmdbufemu/emulate.h
index a2e9ccd9..4bd494ef 100644
--- a/layers/10_cmdbufemu/emulate.h
+++ b/layers/10_cmdbufemu/emulate.h
@@ -11,6 +11,7 @@
extern bool g_EnhancedErrorChecking;
extern bool g_KernelForProfiling;
+extern bool g_SuggestedLocalWorkSize;
extern const struct _cl_icd_dispatch* g_pNextDispatch;
diff --git a/layers/10_cmdbufemu/main.cpp b/layers/10_cmdbufemu/main.cpp
index b1c78f66..2ac2d9df 100644
--- a/layers/10_cmdbufemu/main.cpp
+++ b/layers/10_cmdbufemu/main.cpp
@@ -35,10 +35,16 @@
bool g_EnhancedErrorChecking = false;
// Using kernels for profiling can fix issues with some implementations
-// that do not properly support event profiling on barrkers.
+// that do not properly support event profiling on barriers.
bool g_KernelForProfiling = false;
+// Using the suggested local work-group size can reduce overhead by determining
+// the values for a NULL local work-group size once, when the kernel command is
+// recorded into the command buffer, rather than each time it is executed.
+
+bool g_SuggestedLocalWorkSize = true;
+
const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
static cl_int CL_API_CALL
@@ -231,7 +237,7 @@ static void _init_dispatch()
}
CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
- cl_layer_info param_name,
+ cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret)
@@ -251,10 +257,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
#if defined(CL_LAYER_NAME)
case CL_LAYER_NAME:
{
+ char str[256];
+ snprintf(str, 256, "Emulation Layer for "
+ CL_KHR_COMMAND_BUFFER_EXTENSION_NAME
+ " (EEC: %s, KFP: %s, SLWS: %s)",
+ g_EnhancedErrorChecking ? "Y" : "N",
+ g_KernelForProfiling ? "Y" : "N",
+ g_SuggestedLocalWorkSize ? "Y" : "N");
auto ptr = (char*)param_value;
return writeStringToMemory(
param_value_size,
- "Emulation Layer for " CL_KHR_COMMAND_BUFFER_EXTENSION_NAME,
+ str,
param_value_size_ret,
ptr);
}
@@ -290,6 +303,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
+ getControl("CMDBUFEMU_SuggestedLocalWorkSize", g_SuggestedLocalWorkSize);
g_pNextDispatch = target_dispatch;
diff --git a/samples/12_commandbuffers/main.cpp b/samples/12_commandbuffers/main.cpp
index f176a5c7..879a69f9 100644
--- a/samples/12_commandbuffers/main.cpp
+++ b/samples/12_commandbuffers/main.cpp
@@ -264,17 +264,17 @@ int main(
cl_sync_point_khr sync_point;
clCommandNDRangeKernelKHR(
cmdbuf,
- NULL,
- NULL,
+ NULL, // command queue, can be NULL to use the command buffer queue
+ NULL, // command properties
kernel(),
- 1,
- NULL,
- &gwx,
- NULL,
- 0,
- NULL,
+ 1, // work dim
+ NULL, // global work offset
+ &gwx, // global work size
+ NULL, // local work size
+ 0, // num sync points in wait list
+ NULL, // sync point wait list
&sync_point,
- NULL);
+ NULL); // mutable handle
clFinalizeCommandBufferKHR(cmdbuf);
clEnqueueCommandBufferKHR(
diff --git a/samples/12_commandbufferspp/main.cpp b/samples/12_commandbufferspp/main.cpp
index 9e566b7c..460939f8 100644
--- a/samples/12_commandbufferspp/main.cpp
+++ b/samples/12_commandbufferspp/main.cpp
@@ -202,17 +202,17 @@ int main(
cl_sync_point_khr sync_point;
clCommandNDRangeKernelKHR(
cmdbuf(),
- NULL,
- NULL,
+ NULL, // command queue, can be NULL to use the command buffer queue
+ NULL, // command properties
kernel(),
- 1,
- NULL,
- &gwx,
- NULL,
- 0,
- NULL,
+ 1, // work dim
+ NULL, // global work offset
+ &gwx, // global work size
+ NULL, // local work size
+ 0, // num sync points in wait list
+ NULL, // sync point wait list
&sync_point,
- NULL);
+ NULL); // mutable handle
cmdbuf.finalize();
clEnqueueCommandBufferKHR(
diff --git a/samples/14_ooqcommandbuffers/main.cpp b/samples/14_ooqcommandbuffers/main.cpp
index abed0ffe..679bce0d 100644
--- a/samples/14_ooqcommandbuffers/main.cpp
+++ b/samples/14_ooqcommandbuffers/main.cpp
@@ -152,17 +152,17 @@ int main(
fillKernel.setArg(2, static_cast(gwx));
clCommandNDRangeKernelKHR(
cmdbuf(),
- nullptr,
- nullptr,
+ nullptr, // command queue, can be NULL to use the command buffer queue
+ nullptr, // command properties
fillKernel(),
- 1,
- nullptr,
- &one,
- nullptr,
- 0,
- nullptr,
+ 1, // work dim
+ nullptr, // global work offset
+ &one, // global work size
+ nullptr, // local work size
+ 0, // num sync points in wait list
+ nullptr, // sync point wait list
&writeA,
- nullptr);
+ nullptr); // mutable handle
cl_sync_point_khr writeB = 0;
fillKernel.setArg(0, deviceMemSrcB);
@@ -188,17 +188,17 @@ int main(
addKernel.setArg(2, deviceMemSrcB);
clCommandNDRangeKernelKHR(
cmdbuf(),
- nullptr,
- nullptr,
+ nullptr, // command queue, can be NULL to use the command buffer queue
+ nullptr, // command properties
addKernel(),
- 1,
- nullptr,
- &gwx,
- nullptr,
+ 1, // work dim
+ nullptr, // global work offset
+ &gwx, // global work size
+ nullptr, // local work size
static_cast(waitList.size()),
waitList.data(),
- nullptr,
- nullptr);
+ nullptr, // sync point
+ nullptr); // mutable handle
cmdbuf.finalize();
// Ensure the queue is empty and no processing is happening