Commit 11bc9e6 · Parent: e1e87a3
Author: Markus Tavenrath (OccamRazor)

Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. (llama/8943)


* Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead.

- The allocation overhead of the temporary std::vectors was easily detectable with a sampling profiler and simple to remove.
- ggml_vk_sync_buffers introduces a full pipeline sync, which has a significant cost on the GPU side, sometimes larger than the actual kernel execution. Judging by the code, which only launches compute kernels or copies tensors, barriers for shader reads/writes and transfers appear to be sufficient (a minimal illustration of both changes follows below).

* Fix small typo

---------

Co-authored-by: 0cc4m <[email protected]>
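For illustration only, here is a minimal self-contained sketch of the two ideas against vulkan.hpp. It is not the patch itself: sync_shader_and_transfer, write_storage_buffers and their parameters are made-up placeholders, and the real changes to ggml_vk_sync_buffers and ggml_vk_dispatch_pipeline are in the diff further down.

// Sketch 1 (placeholder code, not ggml): a global memory barrier restricted to
// shader and transfer access, instead of a full eMemoryRead | eMemoryWrite barrier.
#include <initializer_list>
#include <vulkan/vulkan.hpp>

static void sync_shader_and_transfer(vk::CommandBuffer cmd, vk::PipelineStageFlags stage_flags) {
    const vk::AccessFlags access =
        vk::AccessFlagBits::eShaderRead   | vk::AccessFlagBits::eShaderWrite |
        vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite;
    // The braced list is accepted by vulkan.hpp's ArrayProxy, so no temporary
    // std::vector<vk::MemoryBarrier> is allocated per call.
    cmd.pipelineBarrier(stage_flags, stage_flags, {},
                        { vk::MemoryBarrier{ access, access } }, {}, {});
}

// Sketch 2 (placeholder code, not ggml): one vk::WriteDescriptorSet that reads its
// buffer infos straight from an initializer_list, instead of filling two temporary
// std::vectors (DescriptorBufferInfo and WriteDescriptorSet) per dispatch.
static void write_storage_buffers(vk::Device device, vk::DescriptorSet set,
                                  std::initializer_list<vk::DescriptorBufferInfo> infos) {
    // With dstBinding = 0 and descriptorCount = N, the update rolls over into the
    // consecutive bindings 0..N-1, assuming they are all single storage buffers.
    vk::WriteDescriptorSet write{ set, 0, 0, (uint32_t) infos.size(),
                                  vk::DescriptorType::eStorageBuffer, nullptr, infos.begin() };
    device.updateDescriptorSets({ write }, {});
}

In principle, the GPU-side saving described above comes from the narrower access masks: the driver only has to make shader and transfer accesses available and visible, rather than performing the cache maintenance implied by eMemoryRead | eMemoryWrite. The CPU-side saving is simply the removal of per-call heap allocations.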

Files changed (1):
  ggml/src/ggml-vulkan.cpp (+34, -31)
ggml/src/ggml-vulkan.cpp CHANGED
@@ -270,6 +270,10 @@ struct vk_subbuffer {
     vk_buffer buffer;
     uint64_t offset;
     uint64_t size;
+
+    operator vk::DescriptorBufferInfo() const {
+        return { buffer->buffer, offset, size };
+    }
 };

 struct vk_semaphore {
@@ -1065,13 +1069,14 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {

 static void ggml_vk_sync_buffers(vk_context& ctx) {
     VK_LOG_DEBUG("ggml_vk_sync_buffers()");
-    const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
-
     ctx->s->buffer.pipelineBarrier(
         ctx->q->stage_flags,
         ctx->q->stage_flags,
         {},
-        mem_barriers,
+        { {
+          {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite},
+          {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}
+        } },
         {},
         {}
     );
@@ -2424,28 +2429,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
     return s;
 }

-static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
+
+
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
     VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
-    for (auto& buffer : buffers) {
-        std::cerr << "(" << buffer.buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
+    for (auto& buffer : descriptor_buffer_infos) {
+        std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
-    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
-    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
-    GGML_ASSERT(buffers.size() == pipeline->parameter_count);
-    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
-    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
-        descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
-    }
-    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
-        write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
-    }
+    GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);

-    ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
+    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
+    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});

     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
@@ -3127,7 +3127,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3316,7 +3316,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
         sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
 }

@@ -3388,7 +3388,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }

 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3463,7 +3463,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_con
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }

 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3638,7 +3639,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+            { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3838,7 +3840,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_conte
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
+          vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
         sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
 }

@@ -4399,7 +4402,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
            }

            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (op == GGML_OP_ROPE) {
             // Empty src2 is possible in rope, but the shader needs a buffer
             vk_subbuffer subbuf_z;
@@ -4410,20 +4413,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
            }

            ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (op == GGML_OP_IM2COL) {
             // im2col uses only src1 and dst buffers
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src2) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src1) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
     } else {
         GGML_ASSERT(op != GGML_OP_SOFT_MAX);
@@ -4460,10 +4463,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co

            if (use_src1) {
                ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
            } else {
                ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+                ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
            }
         }
     }
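A note on why the new call sites compile: vk_subbuffer gained an implicit conversion operator to vk::DescriptorBufferInfo, so a braced list of vk_subbuffer values can initialize the new std::initializer_list<vk::DescriptorBufferInfo> parameter directly. The following self-contained sketch shows the same C++ mechanism with hypothetical stand-in types (Target, Source, dispatch) in place of the Vulkan and ggml ones.

// Stand-alone illustration of the conversion-operator pattern used above.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

struct Target {              // stands in for vk::DescriptorBufferInfo
    uint64_t handle, offset, size;
};

struct Source {              // stands in for vk_subbuffer
    uint64_t handle, offset, size;
    operator Target() const { return { handle, offset, size }; }  // implicit conversion
};

// stands in for the new ggml_vk_dispatch_pipeline parameter
static void dispatch(std::initializer_list<Target> const& infos) {
    for (auto const& t : infos) {
        std::printf("buffer %llu offset %llu size %llu\n",
                    (unsigned long long) t.handle, (unsigned long long) t.offset,
                    (unsigned long long) t.size);
    }
}

int main() {
    // Each braced element is converted Source -> Target while the list is built,
    // so no heap allocation happens at the call site.
    dispatch({ Source{ 1, 0, 256 }, Source{ 2, 128, 512 } });
    return 0;
}

Because the initializer_list is backed by a temporary array created at the call site, each element is converted in place, and the per-dispatch heap traffic of the old std::vector<vk_subbuffer>&& parameter disappears.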