bashbaug · bashbaug · Mar 23, 2026 · Feb 9, 2026 · Mar 14, 2026 · Mar 23, 2026
diff --git a/layers/10_cmdbufemu/README.md b/layers/10_cmdbufemu/README.md
@@ -34,6 +34,7 @@ The following environment variables can modify the behavior of the command buffe
 |----------------------|----------|-----------------|
 | `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue".  By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br/><br/>`set CMDBUFEMU_EnhancedErrorChecking=1` |
 | `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier.  By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br/><br/>`set CMDBUFEMU_KernelForProfiling=1` |
+| `CMDBUFEMU_SuggestedLocalWorkSize` | Enables use of the suggested local work-group size extension to eliminate `NULL` local work-group sizes.  Only takes effect when the command buffer has a single queue, the implementation supports the suggested local work-group size extension, and the command is not mutable.  By default, use of the suggested local work-group size is enabled. | `export CMDBUFEMU_SuggestedLocalWorkSize=0`<br/><br/>`set CMDBUFEMU_SuggestedLocalWorkSize=0` |
 
 ## Known Limitations
 

diff --git a/layers/10_cmdbufemu/emulate.cpp b/layers/10_cmdbufemu/emulate.cpp
@@ -817,6 +817,7 @@ struct SVMMemFill : Command
 struct NDRangeKernel : Command
 {
     static std::unique_ptr<NDRangeKernel> create(
+        const bool isMutable,
         const cl_command_properties_khr* properties,
         cl_command_buffer_khr cmdbuf,
         cl_command_queue queue,
@@ -1235,6 +1236,11 @@ typedef struct _cl_command_buffer_khr
             cmdbuf->TestQueues.reserve(num_queues);
             cmdbuf->BlockingEvents.reserve(num_queues);
 
+            if( cmdbuf->Queues.size() == 1 )
+            {
+                cmdbuf->setupSuggestedLocalWorkSize();
+            }
+
             for( auto queue : cmdbuf->Queues )
             {
                 g_pNextDispatch->clRetainCommandQueue(queue);
@@ -1683,6 +1689,32 @@ typedef struct _cl_command_buffer_khr
         return CL_SUCCESS;
     }
 
+    // Forwards a suggested local work-group size query to the extension
+    // function, always passing the command buffer's first queue.
+    cl_int  clGetKernelSuggestedLocalWorkSize(
+                cl_command_queue queue,
+                cl_kernel kernel,
+                cl_uint work_dim,
+                const size_t* global_work_offset,
+                const size_t* global_work_size,
+                size_t* suggested_local_work_size )
+    {
+        // Without an extension function pointer there is nothing to call.
+        if( !ptrGetKernelSuggestedLocalWorkSizeKHR )
+        {
+            return CL_INVALID_OPERATION;
+        }
+        // An explicit queue is only accepted when it is the first queue.
+        const bool queueMatches = ( queue == nullptr || queue == Queues[0] );
+        if( !queueMatches )
+        {
+            return CL_INVALID_COMMAND_QUEUE;
+        }
+        return ptrGetKernelSuggestedLocalWorkSizeKHR(
+            Queues[0], kernel, work_dim,
+            global_work_offset, global_work_size, suggested_local_work_size );
+    }
+
 private:
     static constexpr cl_uint cMagic = 0x434d4442;   // "CMDB"
 
@@ -1703,6 +1735,32 @@ typedef struct _cl_command_buffer_khr
     std::vector<std::unique_ptr<Command>> Commands;
     std::atomic<uint32_t> NextSyncPoint;
 
+    clGetKernelSuggestedLocalWorkSizeKHR_fn ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr;
+
+    // Looks up clGetKernelSuggestedLocalWorkSizeKHR for the platform of the
+    // first queue's device.  This is best effort: if any query fails, the
+    // function pointer is left null instead of passing on a null handle.
+    void setupSuggestedLocalWorkSize()
+    {
+        cl_device_id device = nullptr;
+        cl_int errorCode = g_pNextDispatch->clGetCommandQueueInfo(
+            Queues[0], CL_QUEUE_DEVICE, sizeof(device), &device, nullptr );
+        if( errorCode != CL_SUCCESS || device == nullptr )
+        {
+            return;
+        }
+        cl_platform_id platform = nullptr;
+        errorCode = g_pNextDispatch->clGetDeviceInfo(
+            device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, nullptr );
+        if( errorCode != CL_SUCCESS || platform == nullptr )
+        {
+            return;
+        }
+        ptrGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn)
+            g_pNextDispatch->clGetExtensionFunctionAddressForPlatform(
+                platform, "clGetKernelSuggestedLocalWorkSizeKHR" );
+    }
+
     void setupTestQueue(cl_command_queue src)
     {
         if( g_EnhancedErrorChecking )
@@ -1847,6 +1905,7 @@ _cl_mutable_command_khr::_cl_mutable_command_khr(
     Queue(queue ? queue : cmdbuf->getQueue()) {}
 
 std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
+    const bool isMutable,
     const cl_command_properties_khr* properties,
     cl_command_buffer_khr cmdbuf,
     cl_command_queue queue,
@@ -1964,6 +2023,21 @@ std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
             local_work_size,
             local_work_size + work_dim);
     }
+    else if( g_SuggestedLocalWorkSize && isMutable == false )
+    {
+        command->local_work_size.resize(work_dim);
+        cl_int checkError = cmdbuf->clGetKernelSuggestedLocalWorkSize(
+            queue,
+            kernel,
+            work_dim,
+            global_work_offset,
+            global_work_size,
+            command->local_work_size.data() );
+        if( checkError != CL_SUCCESS )
+        {
+            command->local_work_size.clear();
+        }
+    }
 
     g_pNextDispatch->clRetainKernel(command->original_kernel);
 
@@ -2838,8 +2912,11 @@ cl_int CL_API_CALL clCommandNDRangeKernelKHR_EMU(
         }
     }
 
+    const bool isMutable = mutable_handle != nullptr;
+
     cl_int errorCode = CL_SUCCESS;
     auto command = NDRangeKernel::create(
+        isMutable,
         properties,
         cmdbuf,
         command_queue,

diff --git a/layers/10_cmdbufemu/emulate.h b/layers/10_cmdbufemu/emulate.h
@@ -11,6 +11,7 @@
 
 extern bool g_EnhancedErrorChecking;
 extern bool g_KernelForProfiling;
+extern bool g_SuggestedLocalWorkSize;
 
 extern const struct _cl_icd_dispatch* g_pNextDispatch;
 

diff --git a/layers/10_cmdbufemu/main.cpp b/layers/10_cmdbufemu/main.cpp
@@ -35,10 +35,16 @@
 bool g_EnhancedErrorChecking = false;
 
 // Using kernels for profiling can fix issues with some implementations
-// that do not properly support event profiling on barrkers.
+// that do not properly support event profiling on barriers.
 
 bool g_KernelForProfiling = false;
 
+// Using the suggested local work-group size can reduce overhead by determining
+// the values for a NULL local work-group size once, when the command is
+// recorded, rather than every time the command buffer is executed.
+
+bool g_SuggestedLocalWorkSize = true;
+
 const struct _cl_icd_dispatch* g_pNextDispatch = NULL;
 
 static cl_int CL_API_CALL
@@ -231,7 +237,7 @@ static void _init_dispatch()
 }
 
 CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
-    cl_layer_info  param_name,
+    cl_layer_info param_name,
     size_t param_value_size,
     void* param_value,
     size_t* param_value_size_ret)
@@ -251,10 +257,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
 #if defined(CL_LAYER_NAME)
     case CL_LAYER_NAME:
         {
+            char str[256];
+            snprintf(str, 256, "Emulation Layer for "
+                CL_KHR_COMMAND_BUFFER_EXTENSION_NAME
+                " (EEC: %s, KFP: %s, SLWS: %s)",
+                g_EnhancedErrorChecking ? "Y" : "N",
+                g_KernelForProfiling ? "Y" : "N",
+                g_SuggestedLocalWorkSize ? "Y" : "N");
             auto ptr = (char*)param_value;
             return writeStringToMemory(
                 param_value_size,
-                "Emulation Layer for " CL_KHR_COMMAND_BUFFER_EXTENSION_NAME,
+                str,
                 param_value_size_ret,
                 ptr);
         }
@@ -290,6 +303,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(
 
     getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
     getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
+    getControl("CMDBUFEMU_SuggestedLocalWorkSize", g_SuggestedLocalWorkSize);
 
     g_pNextDispatch = target_dispatch;
 

diff --git a/samples/12_commandbuffers/main.cpp b/samples/12_commandbuffers/main.cpp
@@ -264,17 +264,17 @@ int main(
     cl_sync_point_khr sync_point;
     clCommandNDRangeKernelKHR(
         cmdbuf,
-        NULL,
-        NULL,
+        NULL,   // command queue, can be NULL to use the command buffer queue
+        NULL,   // command properties
         kernel(),
-        1,
-        NULL,
-        &gwx,
-        NULL,
-        0,
-        NULL,
+        1,      // work dim
+        NULL,   // global work offset
+        &gwx,   // global work size
+        NULL,   // local work size
+        0,      // num sync points in wait list
+        NULL,   // sync point wait list
         &sync_point,
-        NULL);
+        NULL);  // mutable handle
     clFinalizeCommandBufferKHR(cmdbuf);
 
     clEnqueueCommandBufferKHR(

diff --git a/samples/12_commandbufferspp/main.cpp b/samples/12_commandbufferspp/main.cpp
@@ -202,17 +202,17 @@ int main(
     cl_sync_point_khr sync_point;
     clCommandNDRangeKernelKHR(
         cmdbuf(),
-        NULL,
-        NULL,
+        NULL,   // command queue, can be NULL to use the command buffer queue
+        NULL,   // command properties
         kernel(),
-        1,
-        NULL,
-        &gwx,
-        NULL,
-        0,
-        NULL,
+        1,      // work dim
+        NULL,   // global work offset
+        &gwx,   // global work size
+        NULL,   // local work size
+        0,      // num sync points in wait list
+        NULL,   // sync point wait list
         &sync_point,
-        NULL);
+        NULL);  // mutable handle
     cmdbuf.finalize();
 
     clEnqueueCommandBufferKHR(

diff --git a/samples/14_ooqcommandbuffers/main.cpp b/samples/14_ooqcommandbuffers/main.cpp
@@ -152,17 +152,17 @@ int main(
     fillKernel.setArg(2, static_cast<cl_uint>(gwx));
     clCommandNDRangeKernelKHR(
         cmdbuf(),
-        nullptr,
-        nullptr,
+        nullptr,    // command queue, can be NULL to use the command buffer queue
+        nullptr,    // command properties
         fillKernel(),
-        1,
-        nullptr,
-        &one,
-        nullptr,
-        0,
-        nullptr,
+        1,          // work dim
+        nullptr,    // global work offset
+        &one,       // global work size
+        nullptr,    // local work size
+        0,          // num sync points in wait list
+        nullptr,    // sync point wait list
         &writeA,
-        nullptr);
+        nullptr);   // mutable handle
 
     cl_sync_point_khr writeB = 0;
     fillKernel.setArg(0, deviceMemSrcB);
@@ -188,17 +188,17 @@ int main(
     addKernel.setArg(2, deviceMemSrcB);
     clCommandNDRangeKernelKHR(
         cmdbuf(),
-        nullptr,
-        nullptr,
+        nullptr,    // command queue, can be NULL to use the command buffer queue
+        nullptr,    // command properties
         addKernel(),
-        1,
-        nullptr,
-        &gwx,
-        nullptr,
+        1,          // work dim
+        nullptr,    // global work offset
+        &gwx,       // global work size
+        nullptr,    // local work size
         static_cast<cl_uint>(waitList.size()),
         waitList.data(),
-        nullptr,
-        nullptr);
+        nullptr,    // sync point
+        nullptr);   // mutable handle
     cmdbuf.finalize();
 
     // Ensure the queue is empty and no processing is happening