@@ -234,6 +234,7 @@ class vk_memory_logger;
 #endif
 class vk_perf_logger;
 static void ggml_vk_destroy_buffer(vk_buffer& buf);
+static void ggml_vk_synchronize(ggml_backend_vk_context * ctx);
 
 static constexpr uint32_t mul_mat_vec_max_cols = 8;
 static constexpr uint32_t p021_max_gqa_ratio = 8;
@@ -1583,6 +1584,7 @@ struct ggml_backend_vk_context {
     size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, prealloc_size_add_rms_partials_offset;
     vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials;
     vk::Fence fence, almost_ready_fence;
+    bool submit_pending {};
     bool almost_ready_fence_pending {};
     // Set before op_add and unset after op_rms_norm to indicate that the add should
     // write partial sums to accumulate the square of the vector components
@@ -11204,8 +11206,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
     if (subctx) {
         // Submit and wait for any pending work before reallocating the buffers
         ggml_vk_ctx_end(subctx);
-        ggml_vk_submit(subctx, ctx->fence);
-        ggml_vk_wait_for_fence(ctx);
+        ggml_vk_submit(subctx, {});
+        ctx->submit_pending = true;
+        ggml_vk_synchronize(ctx);
         ggml_vk_ctx_begin(ctx->device, subctx);
     }
 
@@ -11243,7 +11246,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx, vk_contex
     }
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool almost_ready);
 
 // Returns true if node has enqueued work into the queue, false otherwise
 // If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
@@ -11787,7 +11790,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
 
         ctx->compute_ctx.reset();
 
-        bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, false, almost_ready);
+        bool ok = ggml_vk_compute_forward(ctx, cgraph, node_begin, node_idx_begin, almost_ready);
         if (!ok) {
             if (node->op == GGML_OP_UNARY) {
                 std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
@@ -11802,7 +11805,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool use_fence = true, bool almost_ready = false) {
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * cgraph, ggml_tensor * tensor, int tensor_idx, bool almost_ready = false) {
     GGML_UNUSED(cgraph);
     ggml_backend_buffer * buf = nullptr;
 
@@ -11919,16 +11922,10 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
 
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-    // always wait for the GPU work to be done for the last submit
-    if (tensor_idx == subctx->exit_tensor_idx) {
-        use_fence = true;
-    }
-
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
 #ifdef GGML_VULKAN_CHECK_RESULTS
         ggml_vk_check_results_0(ctx, cgraph, tensor_idx);
-        use_fence = true;
 #endif
 
         // Do staging buffer copies
@@ -11940,17 +11937,16 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
             memset(mset.dst, mset.val, mset.n);
         }
 
-        if (almost_ready && !ctx->almost_ready_fence_pending && !use_fence) {
+        if (almost_ready && !ctx->almost_ready_fence_pending) {
             ggml_vk_submit(subctx, ctx->almost_ready_fence);
             ctx->almost_ready_fence_pending = true;
         } else {
-            ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
+            ggml_vk_submit(subctx, {});
         }
+        ctx->submit_pending = true;
 
-        if (use_fence) {
-            ggml_vk_wait_for_fence(ctx);
-        }
 #ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_synchronize(ctx);
         ggml_vk_check_results_1(ctx, cgraph, tensor_idx);
 #endif
     }
@@ -12305,7 +12301,7 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor
 
     if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-        transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
         ctx->transfer_ctx = transfer_ctx;
         ggml_vk_ctx_begin(ctx->device, transfer_ctx);
     } else {
@@ -12328,7 +12324,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_
 
     if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-        transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
         ctx->transfer_ctx = transfer_ctx;
         ggml_vk_ctx_begin(ctx->device, transfer_ctx);
     } else {
@@ -12351,7 +12347,7 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
 
     if (ctx->transfer_ctx.expired()) {
         // Initialize new transfer context
-        transfer_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool);
+        transfer_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
         ctx->transfer_ctx = transfer_ctx;
         ggml_vk_ctx_begin(ctx->device, transfer_ctx);
     } else {
@@ -12368,29 +12364,46 @@ static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_
     return false;
 }
 
-static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
-    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
-    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-    if(ctx->transfer_ctx.expired()) {
-        return;
-    }
+static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) {
+    VK_LOG_DEBUG("ggml_vk_synchronize()");
 
-    vk_context transfer_ctx = ctx->transfer_ctx.lock();
+    bool do_transfer = !ctx->transfer_ctx.expired();
 
-    ggml_vk_ctx_end(transfer_ctx);
+    vk_context transfer_ctx;
+    if (do_transfer) {
+        transfer_ctx = ctx->transfer_ctx.lock();
 
-    for (auto& cpy : transfer_ctx->in_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
+        ggml_vk_ctx_end(transfer_ctx);
+
+        for (auto& cpy : transfer_ctx->in_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+
+        ggml_vk_submit(transfer_ctx, {});
+        ctx->submit_pending = true;
     }
 
-    ggml_vk_submit(transfer_ctx, ctx->fence);
-    ggml_vk_wait_for_fence(ctx);
+    if (ctx->submit_pending) {
+        ctx->device->compute_queue.queue.submit({}, ctx->fence);
+        ggml_vk_wait_for_fence(ctx);
+        ctx->submit_pending = false;
+    }
 
-    for (auto& cpy : transfer_ctx->out_memcpys) {
-        memcpy(cpy.dst, cpy.src, cpy.n);
+    if (do_transfer) {
+        for (auto& cpy : transfer_ctx->out_memcpys) {
+            memcpy(cpy.dst, cpy.src, cpy.n);
+        }
+        ctx->transfer_ctx.reset();
     }
+}
+
+static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
+    VK_LOG_DEBUG("ggml_backend_vk_synchronize()");
+    ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+    ggml_vk_synchronize(ctx);
 
-    ctx->transfer_ctx.reset();
+    ggml_vk_graph_cleanup(ctx);
 }
 
 static bool ggml_vk_is_empty(ggml_tensor * node) {
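The new `ggml_vk_synchronize` above leans on a Vulkan ordering rule: a fence passed with an empty submission is signaled only after everything earlier in submission order on that queue has completed, so one fence-only submit plus a wait flushes all of the deferred `submit_pending` work at once. Below is a minimal standalone sketch of that idiom with vulkan-hpp; the `device`, `queue`, and `fence` handles are assumed to be created and owned by the caller, and error handling is trimmed.

```cpp
#include <cstdint>
#include <vulkan/vulkan.hpp>

// Block until all work previously submitted to `queue` has finished.
// Sketch only: `fence` must be unsignaled and is reset here for reuse.
static void flush_queue_with_fence(vk::Device device, vk::Queue queue, vk::Fence fence) {
    // Empty submission that only carries a fence; its signal operation is
    // ordered after every earlier submission on this queue.
    queue.submit({}, fence);

    // Wait for the fence, then reset it for the next flush.
    if (device.waitForFences(fence, VK_TRUE, UINT64_MAX) != vk::Result::eSuccess) {
        // a real implementation would handle a timeout or propagate the error
    }
    device.resetFences(fence);
}
```

This is the same pattern as the `ctx->device->compute_queue.queue.submit({}, ctx->fence)` call in the hunk above, just without the ggml context around it.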
@@ -12932,8 +12945,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         ctx->device->perf_logger->print_timings();
     }
 
-    ggml_vk_graph_cleanup(ctx);
-
     return GGML_STATUS_SUCCESS;
 
     UNUSED(backend);
@@ -13162,9 +13173,9 @@ static ggml_backend_i ggml_backend_vk_interface = {
     /* .get_name                = */ ggml_backend_vk_name,
     /* .free                    = */ ggml_backend_vk_free,
     /* .set_tensor_async        = */ NULL, // ggml_backend_vk_set_tensor_async,
-    /* .get_tensor_async        = */ NULL, // ggml_backend_vk_get_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_vk_get_tensor_async,
     /* .cpy_tensor_async        = */ NULL, // ggml_backend_vk_cpy_tensor_async,
-    /* .synchronize             = */ NULL, // ggml_backend_vk_synchronize,
+    /* .synchronize             = */ ggml_backend_vk_synchronize,
     /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_update       = */ NULL,
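With `.get_tensor_async` and `.synchronize` wired into the backend interface, callers can overlap a device-to-host readback with CPU work through the generic ggml-backend API. A hedged usage sketch follows; the `backend` handle and tensor are assumed to exist already, and the helper name is illustrative rather than part of ggml.

```cpp
#include <cstdint>
#include <vector>
#include "ggml.h"
#include "ggml-backend.h"

// Read a tensor's raw bytes back from the Vulkan backend without blocking
// until the data is actually needed. Illustrative helper, not ggml API.
static std::vector<uint8_t> read_back_bytes(ggml_backend_t backend, const struct ggml_tensor * t) {
    std::vector<uint8_t> host(ggml_nbytes(t));

    // Enqueues the device-to-host copy; returns before the copy completes.
    ggml_backend_tensor_get_async(backend, t, host.data(), 0, ggml_nbytes(t));

    // ... unrelated CPU work could overlap with the copy here ...

    // Flushes pending submissions and waits on the fence
    // (ggml_backend_vk_synchronize -> ggml_vk_synchronize in this patch).
    ggml_backend_synchronize(backend);

    return host;
}
```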