Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions layers/10_cmdbufemu/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ The following environment variables can modify the behavior of the command buffe
|----------------------|----------|-----------------|
| `CMDBUFEMU_EnhancedErrorChecking` | Enables additional error checking when commands are added to a command buffer using a command buffer "test queue". By default, the additional error checking is disabled. | `export CMDBUFEMU_EnhancedErrorChecking=1`<br/><br/>`set CMDBUFEMU_EnhancedErrorChecking=1` |
| `CMDBUFEMU_KernelForProfiling` | Enables use of an empty kernel for event profiling instead of event profiling on a command-queue barrier. By default, to minimize overhead, the empty kernel is not used. | `export CMDBUFEMU_KernelForProfiling=1`<br/><br/>`set CMDBUFEMU_KernelForProfiling=1` |
| `CMDBUFEMU_SuggestedLocalWorkSize` | Enables use of the suggested local work-group size extension to eliminate `NULL` local work-group sizes. Only valid when an implementation supports the suggested local work-group size extension, the command buffer has a single command queue, and the command is not mutable. By default, use of the suggested local work-group size is enabled. | `export CMDBUFEMU_SuggestedLocalWorkSize=0`<br/><br/>`set CMDBUFEMU_SuggestedLocalWorkSize=0` |

## Known Limitations

Expand Down
77 changes: 77 additions & 0 deletions layers/10_cmdbufemu/emulate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,7 @@ struct SVMMemFill : Command
struct NDRangeKernel : Command
{
static std::unique_ptr<NDRangeKernel> create(
const bool isMutable,
const cl_command_properties_khr* properties,
cl_command_buffer_khr cmdbuf,
cl_command_queue queue,
Expand Down Expand Up @@ -1235,6 +1236,11 @@ typedef struct _cl_command_buffer_khr
cmdbuf->TestQueues.reserve(num_queues);
cmdbuf->BlockingEvents.reserve(num_queues);

if( cmdbuf->Queues.size() == 1 )
{
cmdbuf->setupSuggestedLocalWorkSize();
}

for( auto queue : cmdbuf->Queues )
{
g_pNextDispatch->clRetainCommandQueue(queue);
Expand Down Expand Up @@ -1683,6 +1689,32 @@ typedef struct _cl_command_buffer_khr
return CL_SUCCESS;
}

// Forwards a suggested local work-group size query to the extension entry
// point stored in ptrGetKernelSuggestedLocalWorkSizeKHR.
//
// Returns CL_INVALID_OPERATION when no entry point has been stored, and
// CL_INVALID_COMMAND_QUEUE when a non-NULL queue other than Queues[0] is
// passed.  Otherwise the query is issued against Queues[0] and the entry
// point's return value is passed back unchanged.
cl_int clGetKernelSuggestedLocalWorkSize(
    cl_command_queue queue,
    cl_kernel kernel,
    cl_uint work_dim,
    const size_t* global_work_offset,
    const size_t* global_work_size,
    size_t* suggested_local_work_size )
{
    // Check the entry point first: Queues[0] is only touched once we know
    // the entry point was set up.
    const auto querySuggestedSize = ptrGetKernelSuggestedLocalWorkSizeKHR;
    if( !querySuggestedSize )
    {
        return CL_INVALID_OPERATION;
    }

    // A NULL queue is accepted and treated as Queues[0].
    const auto targetQueue = Queues[0];
    const bool queueIsAcceptable =
        ( queue == nullptr ) || ( queue == targetQueue );
    if( !queueIsAcceptable )
    {
        return CL_INVALID_COMMAND_QUEUE;
    }

    return querySuggestedSize(
        targetQueue,
        kernel,
        work_dim,
        global_work_offset,
        global_work_size,
        suggested_local_work_size );
}

private:
static constexpr cl_uint cMagic = 0x434d4442; // "CMDB"

Expand All @@ -1703,6 +1735,32 @@ typedef struct _cl_command_buffer_khr
std::vector<std::unique_ptr<Command>> Commands;
std::atomic<uint32_t> NextSyncPoint;

clGetKernelSuggestedLocalWorkSizeKHR_fn ptrGetKernelSuggestedLocalWorkSizeKHR = nullptr;

void setupSuggestedLocalWorkSize()
{
cl_device_id device = nullptr;
g_pNextDispatch->clGetCommandQueueInfo(
Queues[0],
CL_QUEUE_DEVICE,
sizeof(device),
&device,
nullptr );

cl_platform_id platform = nullptr;
g_pNextDispatch->clGetDeviceInfo(
device,
CL_DEVICE_PLATFORM,
sizeof(platform),
&platform,
nullptr );

ptrGetKernelSuggestedLocalWorkSizeKHR = (clGetKernelSuggestedLocalWorkSizeKHR_fn)
g_pNextDispatch->clGetExtensionFunctionAddressForPlatform(
platform,
"clGetKernelSuggestedLocalWorkSizeKHR" );
}

void setupTestQueue(cl_command_queue src)
{
if( g_EnhancedErrorChecking )
Expand Down Expand Up @@ -1847,6 +1905,7 @@ _cl_mutable_command_khr::_cl_mutable_command_khr(
Queue(queue ? queue : cmdbuf->getQueue()) {}

std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
const bool isMutable,
const cl_command_properties_khr* properties,
cl_command_buffer_khr cmdbuf,
cl_command_queue queue,
Expand Down Expand Up @@ -1964,6 +2023,21 @@ std::unique_ptr<NDRangeKernel> NDRangeKernel::create(
local_work_size,
local_work_size + work_dim);
}
else if( g_SuggestedLocalWorkSize && isMutable == false )
{
command->local_work_size.resize(work_dim);
cl_int checkError = cmdbuf->clGetKernelSuggestedLocalWorkSize(
queue,
kernel,
work_dim,
global_work_offset,
global_work_size,
command->local_work_size.data() );
if( checkError != CL_SUCCESS )
{
command->local_work_size.clear();
}
}

g_pNextDispatch->clRetainKernel(command->original_kernel);

Expand Down Expand Up @@ -2838,8 +2912,11 @@ cl_int CL_API_CALL clCommandNDRangeKernelKHR_EMU(
}
}

const bool isMutable = mutable_handle != nullptr;

cl_int errorCode = CL_SUCCESS;
auto command = NDRangeKernel::create(
isMutable,
properties,
cmdbuf,
command_queue,
Expand Down
1 change: 1 addition & 0 deletions layers/10_cmdbufemu/emulate.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

extern bool g_EnhancedErrorChecking;
extern bool g_KernelForProfiling;
extern bool g_SuggestedLocalWorkSize;

extern const struct _cl_icd_dispatch* g_pNextDispatch;

Expand Down
20 changes: 17 additions & 3 deletions layers/10_cmdbufemu/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,16 @@
bool g_EnhancedErrorChecking = false;

// Using kernels for profiling can fix issues with some implementations
// that do not properly support event profiling on barrkers.
// that do not properly support event profiling on barriers.

bool g_KernelForProfiling = false;

// Using the suggested local work-group size can reduce overhead by determining
// the values for a NULL local work-group size when the command buffer is
// created rather than when it is executed.

bool g_SuggestedLocalWorkSize = true;

const struct _cl_icd_dispatch* g_pNextDispatch = NULL;

static cl_int CL_API_CALL
Expand Down Expand Up @@ -231,7 +237,7 @@ static void _init_dispatch()
}

CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
cl_layer_info param_name,
cl_layer_info param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret)
Expand All @@ -251,10 +257,17 @@ CL_API_ENTRY cl_int CL_API_CALL clGetLayerInfo(
#if defined(CL_LAYER_NAME)
case CL_LAYER_NAME:
{
char str[256];
snprintf(str, 256, "Emulation Layer for "
CL_KHR_COMMAND_BUFFER_EXTENSION_NAME
" (EEC: %s, KFP: %s, SLWS: %s)",
g_EnhancedErrorChecking ? "Y" : "N",
g_KernelForProfiling ? "Y" : "N",
g_SuggestedLocalWorkSize ? "Y" : "N");
auto ptr = (char*)param_value;
return writeStringToMemory(
param_value_size,
"Emulation Layer for " CL_KHR_COMMAND_BUFFER_EXTENSION_NAME,
str,
param_value_size_ret,
ptr);
}
Expand Down Expand Up @@ -290,6 +303,7 @@ CL_API_ENTRY cl_int CL_API_CALL clInitLayerWithProperties(

getControl("CMDBUFEMU_EnhancedErrorChecking", g_EnhancedErrorChecking);
getControl("CMDBUFEMU_KernelForProfiling", g_KernelForProfiling);
getControl("CMDBUFEMU_SuggestedLocalWorkSize", g_SuggestedLocalWorkSize);

g_pNextDispatch = target_dispatch;

Expand Down
18 changes: 9 additions & 9 deletions samples/12_commandbuffers/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -264,17 +264,17 @@ int main(
cl_sync_point_khr sync_point;
clCommandNDRangeKernelKHR(
cmdbuf,
NULL,
NULL,
NULL, // command queue, can be NULL to use the command buffer queue
NULL, // command properties
kernel(),
1,
NULL,
&gwx,
NULL,
0,
NULL,
1, // work dim
NULL, // global work offset
&gwx, // global work size
NULL, // local work size
0, // num sync points in wait list
NULL, // sync point wait list
&sync_point,
NULL);
NULL); // mutable handle
clFinalizeCommandBufferKHR(cmdbuf);

clEnqueueCommandBufferKHR(
Expand Down
18 changes: 9 additions & 9 deletions samples/12_commandbufferspp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,17 +202,17 @@ int main(
cl_sync_point_khr sync_point;
clCommandNDRangeKernelKHR(
cmdbuf(),
NULL,
NULL,
NULL, // command queue, can be NULL to use the command buffer queue
NULL, // command properties
kernel(),
1,
NULL,
&gwx,
NULL,
0,
NULL,
1, // work dim
NULL, // global work offset
&gwx, // global work size
NULL, // local work size
0, // num sync points in wait list
NULL, // sync point wait list
&sync_point,
NULL);
NULL); // mutable handle
cmdbuf.finalize();

clEnqueueCommandBufferKHR(
Expand Down
34 changes: 17 additions & 17 deletions samples/14_ooqcommandbuffers/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,17 +152,17 @@ int main(
fillKernel.setArg(2, static_cast<cl_uint>(gwx));
clCommandNDRangeKernelKHR(
cmdbuf(),
nullptr,
nullptr,
nullptr, // command queue, can be NULL to use the command buffer queue
nullptr, // command properties
fillKernel(),
1,
nullptr,
&one,
nullptr,
0,
nullptr,
1, // work dim
nullptr, // global work offset
&one, // global work size
nullptr, // local work size
0, // num sync points in wait list
nullptr, // sync point wait list
&writeA,
nullptr);
nullptr); // mutable handle

cl_sync_point_khr writeB = 0;
fillKernel.setArg(0, deviceMemSrcB);
Expand All @@ -188,17 +188,17 @@ int main(
addKernel.setArg(2, deviceMemSrcB);
clCommandNDRangeKernelKHR(
cmdbuf(),
nullptr,
nullptr,
nullptr, // command queue, can be NULL to use the command buffer queue
nullptr, // command properties
addKernel(),
1,
nullptr,
&gwx,
nullptr,
1, // work dim
nullptr, // global work offset
&gwx, // global work size
nullptr, // local work size
static_cast<cl_uint>(waitList.size()),
waitList.data(),
nullptr,
nullptr);
nullptr, // sync point
nullptr); // mutable handle
cmdbuf.finalize();

// Ensure the queue is empty and no processing is happening
Expand Down