Commit 11bc9e6 · Parent: e1e87a3
Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead. (llama/8943)
* Optimize Vulkan backend for better CPU performance and less GPU synchronization overhead.
- Allocation overhead for the temporary std::vectors was easily detectable with a sampling profiler and simple to remove (a standalone sketch of the allocation-free pattern follows the diff below).
- ggml_vk_sync_buffers introduces a full pipeline sync, which has a significant cost on the GPU side, sometimes larger than the actual kernel execution. Barriers covering only shader reads/writes and transfers appear to be sufficient, since the recorded commands only launch compute kernels or copy tensors (see the barrier sketch below).
* Fix small typo
---------
Co-authored-by: 0cc4m <[email protected]>
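For illustration only, not part of the commit: a minimal Vulkan-Hpp sketch of the barrier change, assuming a vk::CommandBuffer cmd in the recording state and the queue's stage mask stage_flags (hypothetical stand-ins for the ggml context fields used by ggml_vk_sync_buffers). The first helper mirrors the old full memory barrier, the second the narrower shader/transfer barrier the commit switches to; note the barrier is also built in place instead of in a heap-allocated std::vector.

#include <vulkan/vulkan.hpp>

// Old behaviour: a full memory barrier over all reads and writes; this stall
// can cost more on the GPU than the kernel it guards.
static void sync_full(vk::CommandBuffer cmd, vk::PipelineStageFlags stage_flags) {
    vk::MemoryBarrier barrier{
        vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
        vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
    };
    cmd.pipelineBarrier(stage_flags, stage_flags, {}, barrier, {}, {});
}

// New behaviour: only shader and transfer accesses are synchronized, which is
// enough when the recorded commands are compute dispatches and buffer copies.
static void sync_shader_transfer(vk::CommandBuffer cmd, vk::PipelineStageFlags stage_flags) {
    vk::MemoryBarrier barrier{
        vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite |
        vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
        vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite |
        vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite
    };
    cmd.pipelineBarrier(stage_flags, stage_flags, {}, barrier, {}, {});
}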
ggml/src/ggml-vulkan.cpp  (+34, -31)

@@ -270,6 +270,10 @@ struct vk_subbuffer {
     vk_buffer buffer;
     uint64_t offset;
     uint64_t size;
+
+    operator vk::DescriptorBufferInfo() const {
+        return { buffer->buffer, offset, size };
+    }
 };
 
 struct vk_semaphore {
@@ -1065,13 +1069,14 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
 
 static void ggml_vk_sync_buffers(vk_context& ctx) {
     VK_LOG_DEBUG("ggml_vk_sync_buffers()");
-    const std::vector<vk::MemoryBarrier> mem_barriers{ { { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite }, { vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite } } };
-
     ctx->s->buffer.pipelineBarrier(
         ctx->q->stage_flags,
         ctx->q->stage_flags,
         {},
-        mem_barriers,
+        { {
+          {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite},
+          {vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite}
+        } },
         {},
         {}
     );
@@ -2424,28 +2429,23 @@ static vk_submission ggml_vk_begin_submission(vk_device& device, vk_queue& q, bo
     return s;
 }
 
-static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
+
+
+static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context* ctx, vk_context& subctx, vk_pipeline& pipeline, std::initializer_list<vk::DescriptorBufferInfo> const& descriptor_buffer_infos, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
     const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
     const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
     const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
     VK_LOG_DEBUG("ggml_vk_dispatch_pipeline(" << pipeline->name << ", {";
-    for (auto& buffer : buffers) {
-        std::cerr << "(" << buffer.buffer->buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
+    for (auto& buffer : descriptor_buffer_infos) {
+        std::cerr << "(" << buffer << ", " << buffer.offset << ", " << buffer.size << "), ";
     }
     std::cerr << "}, (" << wg0 << "," << wg1 << "," << wg2 << "))");
-    std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
-    std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
     GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
-    GGML_ASSERT(buffers.size() == pipeline->parameter_count);
-    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
-    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
-        descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
-    }
-    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
-        write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
-    }
+    GGML_ASSERT(descriptor_buffer_infos.size() == pipeline->parameter_count);
 
-    ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
+    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    vk::WriteDescriptorSet write_descriptor_set{ descriptor_set, 0, 0, pipeline->parameter_count, vk::DescriptorType::eStorageBuffer, nullptr, descriptor_buffer_infos.begin() };
+    ctx->device->device.updateDescriptorSets({ write_descriptor_set }, {});
 
     subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
     subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
@@ -3127,7 +3127,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3316,7 +3316,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context&
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} },
         sizeof(vk_mat_vec_push_constants), &pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z });
 }
 
@@ -3388,7 +3388,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }
 
 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3463,7 +3463,8 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32,
+        { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 }
 
 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3638,7 +3639,8 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&
     } else if (qx_needs_dequant) {
         const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+        ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0,
+            { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
     }
     if (y_non_contig) {
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
@@ -3838,7 +3840,8 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte
     };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, dmmv,
-        { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23}, { d_ids, ids_buf_offset, ids_sz } },
+        { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 },
+          vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } },
         sizeof(vk_mat_vec_id_push_constants), &pc, { groups_x, (uint32_t)nei0, groups_z });
 }
 
@@ -4399,7 +4402,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         }
 
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_ROPE) {
         // Empty src2 is possible in rope, but the shader needs a buffer
         vk_subbuffer subbuf_z;
@@ -4410,20 +4413,20 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
         }
 
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (op == GGML_OP_IM2COL) {
         // im2col uses only src1 and dst buffers
        ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src2) {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else if (use_src1) {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     } else {
         ggml_vk_sync_buffers(subctx);
-        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+        ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
     }
     } else {
         GGML_ASSERT(op != GGML_OP_SOFT_MAX);
@@ -4460,10 +4463,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
 
         if (use_src1) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset + y_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset + x_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
     }
 }
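The temporary-vector removal in ggml_vk_dispatch_pipeline hinges on the new conversion operator on vk_subbuffer: call sites now pass a brace-initialized std::initializer_list<vk::DescriptorBufferInfo>, so no heap allocation happens per dispatch. Below is a standalone sketch of the same pattern using hypothetical stand-in types (buffer_info, sub_buffer, bind_buffers), not the real ggml or Vulkan types.

#include <cstdint>
#include <cstdio>
#include <initializer_list>

struct buffer_info { uint64_t handle, offset, size; };

struct sub_buffer {
    uint64_t handle, offset, size;
    // Implicit conversion lets brace-initialized sub_buffers be collected
    // directly into an initializer_list<buffer_info>, no std::vector needed.
    operator buffer_info() const { return { handle, offset, size }; }
};

// Consumes the whole contiguous list in one call, replacing the old
// per-element push_back loops into temporary std::vectors.
static void bind_buffers(std::initializer_list<buffer_info> const& infos) {
    for (const buffer_info& info : infos) {
        std::printf("bind handle=%llu offset=%llu size=%llu\n",
                    (unsigned long long) info.handle,
                    (unsigned long long) info.offset,
                    (unsigned long long) info.size);
    }
}

int main() {
    // The braced list lives on the stack for the duration of the call.
    bind_buffers({ sub_buffer{ 1, 0, 256 }, sub_buffer{ 2, 128, 64 } });
    return 0;
}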