Commit a66edc0

vulkan: change graph_compute to be async and enable get_tensor_async
This allows some additional CPU/GPU overlap for large pp workloads. It also seems to help a bit for token gen, possibly by removing a small bubble between graph_compute and get_tensor. The async set and copy functions appear to be used very rarely, so they are not enabled here since I didn't have a good way to test them.

The async commands need to be ordered against each other, so they are all placed on the compute queue; the non-async commands still use the transfer queue. The fence for graph_compute/get_tensor_async is submitted and waited on in ggml_vk_synchronize.
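
For context, a minimal sketch of how a client exercises the new async path through the generic ggml-backend API (assumed setup: `backend` is an initialized Vulkan backend, `graph` a built compute graph, `out` a tensor in a Vulkan buffer; the helper name is hypothetical, not part of this commit):

    #include "ggml-backend.h"

    // Enqueue the graph and the readback without blocking; with this commit both
    // end up on the Vulkan compute queue, and the fence is only submitted and
    // waited on when ggml_backend_synchronize() is called.
    static void run_and_read(ggml_backend_t backend, ggml_cgraph * graph,
                             ggml_tensor * out, void * host_dst, size_t nbytes) {
        ggml_backend_graph_compute_async(backend, graph);                  // no fence wait here
        ggml_backend_tensor_get_async(backend, out, host_dst, 0, nbytes);  // async readback
        // ... overlap other CPU work here ...
        ggml_backend_synchronize(backend);                                 // fence submit + wait
    }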
1 parent 0c74f32 · commit a66edc0


ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 49 additions & 38 deletions
@@ -234,6 +234,7 @@ class vk_memory_logger;
 #endif
 class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -1583,6 +1584,7 @@ struct ggml_backend_vk_context {
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials;
     vk::Fence fence, almost_ready_fence;
+    bool submit_pending {};
     bool almost_ready_fence_pending {};
     // Set before op_add and unset after op_rms_norm to indicate that the add should
     // write partial sums to accumulate the square of the vector components
@@ -11204,8 +11206,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
     if (subctx) {
         // Submit and wait for any pending work before reallocating the buffers
         ggml_vk_ctx_end(subctx);
-        ggml_vk_submit(subctx, ctx->fence);
-        ggml_vk_wait_for_fence(ctx);
+        ggml_vk_submit(subctx, {});
+        ctx->submit_pending = true;
+        ggml_vk_synchronize(ctx);
         ggml_vk_ctx_begin(ctx->device, subctx);
     }
 
@@ -11243,7 +11246,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);
 
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
@@ -11787,7 +11790,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
         ctx->compute_ctx.reset();
 
-        bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, false, almost_ready);
+        bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready);
         if (!ok) {
            if (node->op == GGML_OP_UNARY) {
                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -11802,7 +11805,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) {
     GGML_UNUSED(cgraph);
     ggml_backend_buffer * buf = nullptr;
 
@@ -11919,16 +11922,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
 
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-    // always wait for the GPU work to be done for the last submit
-    if (tensor_idx == subctx->exit_tensor_idx) {
-        use_fence = true;
-    }
-
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
 #ifdef GGML_VULKAN_CHECK_RESULTS
         ggml_vk_check_results_0(ctx, cgraph, tensor_idx);
-        use_fence = true;
 #endif
 
         // Do staging buffer copies
@@ -11940,17 +11937,16 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
             memset(mset.dst, mset.val, mset.n);
         }
 
-        if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+        if (almost_ready && !ctx->almost_ready_fence_pending) {
             ggml_vk_submit(subctx, ctx->almost_ready_fence);
             ctx->almost_ready_fence_pending = true;
         } else {
-            ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+            ggml_vk_submit(subctx, {});
         }
+        ctx->submit_pending = true;
 
-        if (use_fence) {
-            ggml_vk_wait_for_fence(ctx);
-        }
 #ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_synchronize(ctx);
         ggml_vk_check_results_1(ctx, cgraph, tensor_idx);
 #endif
     }
@@ -12305,7 +12301,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
 
     if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-        transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
         ctx->transfer_ctx = transfer_ctx;
         ggml_vk_ctx_begin(ctx->device, transfer_ctx);
     } else {
@@ -12328,7 +12324,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
 
     if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-        transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
         ctx->transfer_ctx = transfer_ctx;
         ggml_vk_ctx_begin(ctx->device, transfer_ctx);
     } else {
@@ -12351,7 +12347,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
 
     if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-        transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
         ctx->transfer_ctx = transfer_ctx;
         ggml_vk_ctx_begin(ctx->device, transfer_ctx);
     } else {
@@ -12368,29 +12364,46 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
     return false;
 }
 
-static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    if(ctx->transfer_ctx.expired()) {
-        return;
-    }
+static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
+    VK_LOG_DEBUG("ggml_vk_synchronize()");
 
-    vk_context transfer_ctx = ctx->transfer_ctx.lock();
+    bool do_transfer = !ctx->transfer_ctx.expired();
 
-    ggml_vk_ctx_end(transfer_ctx);
+    vk_context transfer_ctx;
+    if (do_transfer) {
+        transfer_ctx = ctx->transfer_ctx.lock();
 
-    for (auto& cpy : transfer_ctx->in_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
+        ggml_vk_ctx_end(transfer_ctx);
+
+        for (auto& cpy : transfer_ctx->in_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+
+        ggml_vk_submit(transfer_ctx, {});
+        ctx->submit_pending = true;
     }
 
-    ggml_vk_submit(transfer_ctx, ctx->fence);
-    ggml_vk_wait_for_fence(ctx);
+    if (ctx->submit_pending) {
+        ctx->device->compute_queue.queue.submit({}, ctx->fence);
+        ggml_vk_wait_for_fence(ctx);
+        ctx->submit_pending = false;
+    }
 
-    for (auto& cpy : transfer_ctx->out_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
+    if (do_transfer) {
+        for (auto& cpy : transfer_ctx->out_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+        ctx->transfer_ctx.reset();
     }
+}
+
+static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
+    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    ggml_vk_synchronize(ctx);
 
-    ctx->transfer_ctx.reset();
+    ggml_vk_graph_cleanup(ctx);
 }
 
 static bool ggml_vk_is_empty(ggml_tensor * node) {
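
A note on the deferred fence above, since it is the core of the change: individual submissions no longer carry a fence; instead ggml_vk_synchronize submits an empty batch whose only purpose is to signal ctx->fence. Vulkan orders a fence signal operation after all commands submitted earlier to the same queue, which is also why the async commands all have to be recorded on the compute queue. A minimal, standalone sketch of that pattern in plain Vulkan-HPP (the names `device`, `queue`, `cmd_bufs`, and `fence` are placeholders, not code from this commit):

    #include <vulkan/vulkan.hpp>
    #include <cstdint>
    #include <vector>

    void submit_now_sync_later(vk::Device device, vk::Queue queue,
                               const std::vector<vk::CommandBuffer> & cmd_bufs,
                               vk::Fence fence) {
        // Submit real work with no fence; the CPU keeps running.
        vk::SubmitInfo submit_info({}, {}, cmd_bufs, {});
        queue.submit(submit_info, {});

        // ... possibly more unfenced submissions to the same queue ...

        // Synchronization point: an empty submission that only signals the fence.
        // The signal is ordered after everything submitted to this queue above,
        // so waiting on the fence waits for all of it.
        queue.submit({}, fence);
        (void)device.waitForFences(fence, VK_TRUE, UINT64_MAX);
        device.resetFences(fence);
    }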
@@ -12932,8 +12945,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         ctx->device->perf_logger->print_timings();
     }
 
-    ggml_vk_graph_cleanup(ctx);
-
     return GGML_STATUS_SUCCESS;
 
     UNUSED(backend);
@@ -13162,9 +13173,9 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
     /* .free                    = */ ggml_backend_vk_free,
     /* .set_tensor_async        = */ NULL, // ggml_backend_vk_set_tensor_async,
-    /* .get_tensor_async        = */ NULL, // ggml_backend_vk_get_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_vk_get_tensor_async,
     /* .cpy_tensor_async        = */ NULL, // ggml_backend_vk_cpy_tensor_async,
-    /* .synchronize             = */ NULL, // ggml_backend_vk_synchronize,
+    /* .synchronize             = */ ggml_backend_vk_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_update       = */ NULL,
