Skip to content

Commit 583fb91

Browse files
authored
[SYCL] Support USM memcpy/copy in graphs (#186)
- Add support for USM memcpy and copy in graphs - New MemoryManager method for usm copies - New PI extension methods for enqueueing USM copies - Re-enabled and fixed issues in E2E tests for copies - Minor changes to queue_impl to capture copies correctly
1 parent 52ecaf3 commit 583fb91

File tree

13 files changed

+191
-35
lines changed

13 files changed

+191
-35
lines changed

sycl/include/sycl/detail/pi.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ _PI_API(piextCommandBufferRetain)
147147
_PI_API(piextCommandBufferRelease)
148148
_PI_API(piextCommandBufferFinalize)
149149
_PI_API(piextCommandBufferNDRangeKernel)
150+
_PI_API(piextCommandBufferMemcpyUSM)
150151
_PI_API(piextEnqueueCommandBuffer)
151152
_PI_API(piPluginGetLastError)
152153

sycl/include/sycl/detail/pi.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2157,7 +2157,7 @@ piextCommandBufferFinalize(pi_ext_command_buffer command_buffer);
21572157
/// \param local_work_size Local work size to use when executing kernel.
21582158
/// \param num_sync_points_in_wait_list The number of sync points in the
21592159
/// provided wait list.
2160-
/// \param sync_point_wait_list A list of sync points that this executions must
2160+
/// \param sync_point_wait_list A list of sync points that this command must
21612161
/// wait on.
21622162
/// \param sync_point The sync_point associated with this kernel execution.
21632163
__SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
@@ -2167,6 +2167,22 @@ __SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
21672167
const pi_ext_sync_point *sync_point_wait_list,
21682168
pi_ext_sync_point *sync_point);
21692169

2170+
/// API to append a USM memcpy command to the command-buffer.
2171+
/// \param command_buffer The command-buffer to append onto.
2172+
/// \param dst_ptr is the location the data will be copied
2173+
/// \param src_ptr is the data to be copied
2174+
/// \param size is number of bytes to copy
2175+
/// \param num_sync_points_in_wait_list The number of sync points in the
2176+
/// provided wait list.
2177+
/// \param sync_point_wait_list A list of sync points that this command must
2178+
/// wait on.
2179+
/// \param sync_point The sync_point associated with this memory operation.
2180+
__SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM(
2181+
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
2182+
size_t size, pi_uint32 num_sync_points_in_wait_list,
2183+
const pi_ext_sync_point *sync_point_wait_list,
2184+
pi_ext_sync_point *sync_point);
2185+
21702186
/// API to submit the command-buffer to queue for execution, returns an error if
21712187
/// command-buffer not finalized or another instance of same command-buffer
21722188
/// currently executing.

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5836,6 +5836,15 @@ pi_result cuda_piextCommandBufferNDRangeKernel(
58365836
return {};
58375837
}
58385838

5839+
pi_result cuda_piextCommandBufferMemcpyUSM(
5840+
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
5841+
size_t size, pi_uint32 num_sync_points_in_wait_list,
5842+
const pi_ext_sync_point *sync_point_wait_list,
5843+
pi_ext_sync_point *sync_point) {
5844+
sycl::detail::pi::die("command-buffer API not implemented in CUDA backend");
5845+
return {};
5846+
}
5847+
58395848
pi_result cuda_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
58405849
pi_queue queue,
58415850
pi_uint32 num_events_in_wait_list,
@@ -6040,6 +6049,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
60406049
_PI_CL(piextCommandBufferRetain, cuda_piextCommandBufferRetain)
60416050
_PI_CL(piextCommandBufferRelease, cuda_piextCommandBufferRelease)
60426051
_PI_CL(piextCommandBufferNDRangeKernel, cuda_piextCommandBufferNDRangeKernel)
6052+
_PI_CL(piextCommandBufferMemcpyUSM, cuda_piextCommandBufferMemcpyUSM)
60436053
_PI_CL(piextEnqueueCommandBuffer, cuda_piextEnqueueCommandBuffer)
60446054

60456055
_PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)

sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2127,6 +2127,15 @@ pi_result piextCommandBufferNDRangeKernel(
21272127
DIE_NO_IMPLEMENTATION;
21282128
}
21292129

2130+
pi_result
2131+
piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
2132+
const void *src_ptr, size_t size,
2133+
pi_uint32 num_sync_points_in_wait_list,
2134+
const pi_ext_sync_point *sync_point_wait_list,
2135+
pi_ext_sync_point *sync_point) {
2136+
DIE_NO_IMPLEMENTATION;
2137+
}
2138+
21302139
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
21312140
pi_queue queue,
21322141
pi_uint32 num_events_in_wait_list,

sycl/plugins/hip/pi_hip.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5583,6 +5583,16 @@ pi_result hip_piextCommandBufferNDRangeKernel(
55835583
return {};
55845584
}
55855585

5586+
pi_result
5587+
hip_piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer,
5588+
void *dst_ptr, const void *src_ptr, size_t size,
5589+
pi_uint32 num_sync_points_in_wait_list,
5590+
const pi_ext_sync_point *sync_point_wait_list,
5591+
pi_ext_sync_point *sync_point) {
5592+
sycl::detail::pi::die("command-buffer API not implemented in HIP backend");
5593+
return {};
5594+
}
5595+
55865596
pi_result hip_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
55875597
pi_queue queue,
55885598
pi_uint32 num_events_in_wait_list,
@@ -5787,6 +5797,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
57875797
_PI_CL(piextCommandBufferRetain, hip_piextCommandBufferRetain)
57885798
_PI_CL(piextCommandBufferRelease, hip_piextCommandBufferRelease)
57895799
_PI_CL(piextCommandBufferNDRangeKernel, hip_piextCommandBufferNDRangeKernel)
5800+
_PI_CL(piextCommandBufferMemcpyUSM, hip_piextCommandBufferMemcpyUSM)
57905801
_PI_CL(piextEnqueueCommandBuffer, hip_piextEnqueueCommandBuffer)
57915802

57925803
_PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj)

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 60 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8831,6 +8831,23 @@ pi_result _pi_buffer::free() {
88318831

88328832
/// command-buffer Extension
88338833

8834+
/// Helper function to take a list of pi_ext_sync_points and fill the provided
8835+
/// vector with the associated ZeEvents
8836+
static pi_result getEventsFromSyncPoints(
8837+
const std::unordered_map<pi_ext_sync_point, pi_event> &SyncPoints,
8838+
size_t NumSyncPointsInWaitList, const pi_ext_sync_point *SyncPointWaitList,
8839+
std::vector<ze_event_handle_t> &ZeEventList) {
8840+
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
8841+
if (auto EventHandle = SyncPoints.find(SyncPointWaitList[i]);
8842+
EventHandle != SyncPoints.end()) {
8843+
ZeEventList.push_back(EventHandle->second->ZeEvent);
8844+
} else {
8845+
return PI_ERROR_INVALID_VALUE;
8846+
}
8847+
}
8848+
return PI_SUCCESS;
8849+
}
8850+
88348851
pi_result piextCommandBufferCreate(pi_context Context, pi_device Device,
88358852
const pi_ext_command_buffer_desc *Desc,
88368853
pi_ext_command_buffer *RetCommandBuffer) {
@@ -8935,19 +8952,16 @@ pi_result piextCommandBufferNDRangeKernel(
89358952

89368953
ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
89378954

8938-
std::vector<ze_event_handle_t> ZeEventList(NumSyncPointsInWaitList);
8939-
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
8940-
if (auto EventHandle = CommandBuffer->SyncPoints.find(SyncPointWaitList[i]);
8941-
EventHandle != CommandBuffer->SyncPoints.end()) {
8942-
ZeEventList[i] = CommandBuffer->SyncPoints[SyncPointWaitList[i]]->ZeEvent;
8943-
} else {
8944-
return PI_ERROR_INVALID_VALUE;
8945-
}
8955+
std::vector<ze_event_handle_t> ZeEventList;
8956+
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
8957+
NumSyncPointsInWaitList,
8958+
SyncPointWaitList, ZeEventList);
8959+
if (Res) {
8960+
return Res;
89468961
}
8947-
89488962
pi_event LaunchEvent;
8949-
auto res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
8950-
if (res)
8963+
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
8964+
if (Res)
89518965
return PI_ERROR_OUT_OF_HOST_MEMORY;
89528966

89538967
LaunchEvent->CommandData = (void *)Kernel;
@@ -8972,6 +8986,41 @@ pi_result piextCommandBufferNDRangeKernel(
89728986
return PI_SUCCESS;
89738987
}
89748988

8989+
pi_result piextCommandBufferMemcpyUSM(
8990+
pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr,
8991+
size_t Size, pi_uint32 NumSyncPointsInWaitList,
8992+
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
8993+
if (!DstPtr) {
8994+
return PI_ERROR_INVALID_VALUE;
8995+
}
8996+
8997+
std::vector<ze_event_handle_t> ZeEventList;
8998+
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
8999+
NumSyncPointsInWaitList,
9000+
SyncPointWaitList, ZeEventList);
9001+
if (Res) {
9002+
return Res;
9003+
}
9004+
9005+
pi_event LaunchEvent;
9006+
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
9007+
if (Res)
9008+
return PI_ERROR_OUT_OF_HOST_MEMORY;
9009+
9010+
ZE_CALL(zeCommandListAppendMemoryCopy,
9011+
(CommandBuffer->ZeCommandList, DstPtr, SrcPtr, Size,
9012+
LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));
9013+
9014+
urPrint("calling zeCommandListAppendMemoryCopy() with"
9015+
" ZeEvent %#lx\n",
9016+
ur_cast<std::uintptr_t>(LaunchEvent->ZeEvent));
9017+
9018+
// Get sync point and register the event with it.
9019+
*SyncPoint = CommandBuffer->GetNextSyncPoint();
9020+
CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
9021+
return PI_SUCCESS;
9022+
}
9023+
89759024
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,
89769025
pi_queue Queue,
89779026
pi_uint32 NumEventsInWaitList,

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2300,6 +2300,16 @@ pi_result piextCommandBufferNDRangeKernel(
23002300
return {};
23012301
}
23022302

2303+
pi_result
2304+
piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
2305+
const void *src_ptr, size_t size,
2306+
pi_uint32 num_sync_points_in_wait_list,
2307+
const pi_ext_sync_point *sync_point_wait_list,
2308+
pi_ext_sync_point *sync_point) {
2309+
// Not implemented
2310+
return {};
2311+
}
2312+
23032313
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
23042314
pi_queue queue,
23052315
pi_uint32 num_events_in_wait_list,
@@ -2509,6 +2519,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
25092519
_PI_CL(piextCommandBufferRetain, piextCommandBufferRetain)
25102520
_PI_CL(piextCommandBufferRelease, piextCommandBufferRelease)
25112521
_PI_CL(piextCommandBufferNDRangeKernel, piextCommandBufferNDRangeKernel)
2522+
_PI_CL(piextCommandBufferMemcpyUSM, piextCommandBufferMemcpyUSM)
25122523
_PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer)
25132524

25142525
_PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj)

sycl/source/detail/memory_manager.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,21 @@ void MemoryManager::copy_from_device_global(
12231223
DepEvents, OutEvent);
12241224
}
12251225

1226+
// Command buffer methods
1227+
void MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
1228+
ContextImplPtr Context, const void *SrcMem,
1229+
RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem,
1230+
std::vector<RT::PiExtSyncPoint> Deps, RT::PiExtSyncPoint *OutSyncPoint) {
1231+
if (!SrcMem || !DstMem)
1232+
throw runtime_error("NULL pointer argument in memory copy operation.",
1233+
PI_ERROR_INVALID_VALUE);
1234+
1235+
const PluginPtr &Plugin = Context->getPlugin();
1236+
Plugin->call<PiApiKind::piextCommandBufferMemcpyUSM>(
1237+
CommandBuffer, DstMem, SrcMem, Len, Deps.size(), Deps.data(),
1238+
OutSyncPoint);
1239+
}
1240+
12261241
} // namespace detail
12271242
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
12281243
} // namespace sycl

sycl/source/detail/memory_manager.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ class __SYCL_EXPORT MemoryManager {
173173
const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue,
174174
size_t NumBytes, size_t Offset, void *DstMem, OSModuleHandle M,
175175
const std::vector<RT::PiEvent> &DepEvents, RT::PiEvent *OutEvent);
176+
177+
// Command buffer extension methods
178+
static void ext_oneapi_copy_usm_cmd_buffer(
179+
ContextImplPtr Context, const void *SrcMem,
180+
RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem,
181+
std::vector<RT::PiExtSyncPoint> Deps, RT::PiExtSyncPoint *OutSyncPoint);
176182
};
177183
} // namespace detail
178184
} // __SYCL_INLINE_VER_NAMESPACE(_V1)

sycl/source/detail/queue_impl.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,16 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
146146
// Emit a begin/end scope for this call
147147
PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
148148
#endif
149+
// If we have a command graph set we need to capture the copy through normal
150+
// queue submission rather than execute the copy directly.
151+
if (MGraph) {
152+
return submit(
153+
[&](handler &CGH) {
154+
CGH.depends_on(DepEvents);
155+
CGH.memcpy(Dest, Src, Count);
156+
},
157+
Self, {});
158+
}
149159
if (MHasDiscardEventsSupport) {
150160
MemoryManager::copy_usm(Src, Self, Count, Dest,
151161
getOrWaitEvents(DepEvents, MContext), nullptr);

0 commit comments

Comments
 (0)