Vulkan k-quant mmq and ggml-backend offload functionality (llama/6155)
* Fix Vulkan no kv offload incoherence
* Add k-quant mul mat mat shaders
* Rework working buffer allocation; reduces VRAM use noticeably
* Clean up CPU assist code, replace it with the ggml-backend offload function
* Default to all dedicated GPUs
* Add fallback to integrated GPUs if no dedicated GPUs are found
* Add debug info showing which device is allocating memory
* Fix Intel dequant issue
* Fix validation issue
* Fix Vulkan GGML_OP_GET_ROWS implementation
* Clean up merge artifacts
* Remove Vulkan warning
- ggml-vulkan.cpp +328 -307
- ggml-vulkan.h +0 -11
- ggml.c +0 -35
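Besides the new k-quant matrix-matrix shaders, the main refactor is that the old Vulkan "CPU assist" entry points (the `*_cpu_assist` functions removed further down, together with their hooks in ggml.c) are replaced by the standard ggml-backend `offload_op` callback, which this diff wires into `ggml_backend_vk_interface`. As illustration only, a minimal sketch of what such a callback looks like in a ggml backend; the helper name, the batch-size threshold of 32 and the exact op filter are assumptions here, not taken from this commit:

    // Sketch of a ggml-backend offload_op hook: report whether an op whose data
    // currently lives on the CPU is worth running on this backend. Typically only
    // large batched matrix multiplications pay for the transfer cost.
    #include "ggml.h"
    #include "ggml-backend.h"

    static bool example_vk_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        (void) backend;                 // unused in this sketch
        const int min_batch_size = 32;  // assumed cutoff, tune per backend

        if (op->op != GGML_OP_MUL_MAT && op->op != GGML_OP_MUL_MAT_ID) {
            return false;               // everything else stays where its data lives
        }
        return op->ne[1] >= min_batch_size;
    }

The graph scheduler consults this hook to decide whether to route such ops to the GPU backend, which is roughly what the removed ggml_vk_*_cpu_assist path used to do by hand.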
ggml-vulkan.cpp
CHANGED
@@ -9,7 +9,6 @@
 #include <algorithm>
 #include <cmath>
 #include <iostream>
-#include <iomanip>
 #include <limits>
 #include <tuple>
 #include <vector>
@@ -340,8 +339,8 @@ struct ggml_backend_vk_context {
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
     std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;
-    size_t …
-    vk_buffer …
+    size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k;
+    vk_buffer prealloc_x, prealloc_y, prealloc_split_k;
     vk::Fence fence;
     vk_buffer staging;
     size_t staging_size;
@@ -809,7 +808,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr
 
 static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
@@ -998,6 +997,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
 
     if (device->fp16) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
@@ -1055,6 +1059,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
     } else {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
@@ -1111,6 +1150,41 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
+        … [remaining k-quant fp32 pipelines truncated in this capture]
     }
 
     ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
|
@@ -1139,19 +1213,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
|
|
| 1139 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
|
| 1140 |
|
| 1141 |
// get_rows
|
| 1142 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
|
| 1143 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
|
| 1144 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
|
| 1145 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
|
| 1146 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
|
| 1147 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[
|
| 1148 |
-
|
| 1149 |
-
|
| 1150 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
|
| 1151 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
|
| 1152 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
|
| 1153 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
|
| 1154 |
-
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[
|
| 1155 |
|
| 1156 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
|
| 1157 |
|
|
@@ -1341,7 +1417,33 @@ void ggml_vk_instance_init() {
|
|
| 1341 |
vk_instance.device_indices.push_back(tmp);
|
| 1342 |
}
|
| 1343 |
} else {
|
| 1344 |
-
vk_instance.
|
| 1345 |
}
|
| 1346 |
|
| 1347 |
vk_instance_initialized = true;
|
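The hunk above also carries the new default device selection ("Default to all dedicated GPUs", with a fallback to an integrated GPU when none is found); the added lines did not survive the page extraction. A sketch of that selection logic using the Vulkan C++ API, with a hypothetical helper name and no claim of matching the committed code:

    // Sketch: collect all discrete GPUs; if there are none, fall back to the
    // first integrated GPU that the Vulkan instance exposes.
    #include <vulkan/vulkan.hpp>
    #include <vector>

    static std::vector<size_t> example_default_device_indices(const vk::Instance & instance) {
        std::vector<vk::PhysicalDevice> devices = instance.enumeratePhysicalDevices();
        std::vector<size_t> indices;

        for (size_t i = 0; i < devices.size(); i++) {
            if (devices[i].getProperties().deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
                indices.push_back(i);   // default: every dedicated GPU
            }
        }
        if (indices.empty()) {
            for (size_t i = 0; i < devices.size(); i++) {
                if (devices[i].getProperties().deviceType == vk::PhysicalDeviceType::eIntegratedGpu) {
                    indices.push_back(i);   // fallback: one integrated GPU
                    break;
                }
            }
        }
        return indices;
    }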
|
@@ -1567,6 +1669,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
|
|
| 1567 |
|
| 1568 |
switch (src0_type) {
|
| 1569 |
case GGML_TYPE_Q4_0:
|
| 1570 |
break;
|
| 1571 |
default:
|
| 1572 |
return nullptr;
|
|
@@ -2034,7 +2145,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
|
|
| 2034 |
ggml_vk_submit(subctx, ctx->fence);
|
| 2035 |
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
| 2036 |
ctx->device->device.resetFences({ ctx->fence });
|
| 2037 |
-
ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
|
| 2038 |
}
|
| 2039 |
}
|
| 2040 |
|
|
@@ -2131,7 +2241,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
|
|
| 2131 |
for (auto& cpy : subctx->out_memcpys) {
|
| 2132 |
memcpy(cpy.dst, cpy.src, cpy.n);
|
| 2133 |
}
|
| 2134 |
-
ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
|
| 2135 |
}
|
| 2136 |
}
|
| 2137 |
|
|
@@ -2298,6 +2407,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
|
|
| 2298 |
return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
|
| 2299 |
case VK_VENDOR_ID_INTEL:
|
| 2300 |
return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
|
| 2301 |
}
|
| 2302 |
|
| 2303 |
if (m <= 32 || n <= 32) {
|
|
@@ -2423,11 +2534,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
| 2423 |
src1_uma = d_Qy != nullptr;
|
| 2424 |
}
|
| 2425 |
|
| 2426 |
-
const bool
|
| 2427 |
-
const bool
|
| 2428 |
-
|
| 2429 |
-
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
| 2430 |
-
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
| 2431 |
|
| 2432 |
const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
|
| 2433 |
|
|
@@ -2469,16 +2577,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
| 2469 |
uint64_t x_buf_offset = 0;
|
| 2470 |
vk_buffer d_Y;
|
| 2471 |
uint64_t y_buf_offset = 0;
|
| 2472 |
-
if (
|
| 2473 |
-
d_Qx = ctx->prealloc_qx;
|
| 2474 |
-
} else if (!src0_uma) {
|
| 2475 |
d_Qx = extra_src0->buffer_gpu.lock();
|
| 2476 |
qx_buf_offset = extra_src0->offset;
|
| 2477 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2478 |
}
|
| 2479 |
-
if (
|
| 2480 |
-
d_Qy = ctx->prealloc_qy;
|
| 2481 |
-
} else if (!src1_uma) {
|
| 2482 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2483 |
qy_buf_offset = extra_src1->offset;
|
| 2484 |
GGML_ASSERT(d_Qy != nullptr);
|
|
@@ -2530,33 +2634,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
|
|
| 2530 |
|
| 2531 |
if (x_non_contig) {
|
| 2532 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
| 2533 |
-
} else if (
|
| 2534 |
-
|
| 2535 |
-
|
| 2536 |
-
|
| 2537 |
-
ctx->staging_offset = qx_sz * ne02 * ne03;
|
| 2538 |
-
}
|
| 2539 |
-
|
| 2540 |
-
if (qx_needs_dequant) {
|
| 2541 |
-
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
| 2542 |
-
ggml_vk_sync_buffers(subctx);
|
| 2543 |
-
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
| 2544 |
-
}
|
| 2545 |
}
|
| 2546 |
if (y_non_contig) {
|
| 2547 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
| 2548 |
-
} else if (load_y) {
|
| 2549 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
|
| 2550 |
}
|
| 2551 |
|
| 2552 |
uint32_t stride_batch_x = ne00*ne01;
|
| 2553 |
uint32_t stride_batch_y = ne10*ne11;
|
| 2554 |
|
| 2555 |
-
if (!ggml_vk_dim01_contiguous(src0) && !
|
| 2556 |
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
| 2557 |
}
|
| 2558 |
|
| 2559 |
-
if (!ggml_vk_dim01_contiguous(src1) && !
|
| 2560 |
stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
|
| 2561 |
}
|
| 2562 |
|
|
@@ -2616,11 +2710,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
| 2616 |
src1_uma = d_Qy != nullptr;
|
| 2617 |
}
|
| 2618 |
|
| 2619 |
-
const bool
|
| 2620 |
-
const bool
|
| 2621 |
-
|
| 2622 |
-
const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0);
|
| 2623 |
-
const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1);
|
| 2624 |
|
| 2625 |
const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
|
| 2626 |
|
|
@@ -2644,16 +2735,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
| 2644 |
uint64_t x_buf_offset = 0;
|
| 2645 |
vk_buffer d_Y;
|
| 2646 |
uint64_t y_buf_offset = 0;
|
| 2647 |
-
if
|
| 2648 |
-
d_Qx = ctx->prealloc_qx;
|
| 2649 |
-
} else if(!src1_uma) {
|
| 2650 |
d_Qx = extra_src0->buffer_gpu.lock();
|
| 2651 |
qx_buf_offset = extra_src0->offset;
|
| 2652 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2653 |
}
|
| 2654 |
-
if
|
| 2655 |
-
d_Qy = ctx->prealloc_qy;
|
| 2656 |
-
} else if(!src1_uma) {
|
| 2657 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2658 |
qy_buf_offset = extra_src1->offset;
|
| 2659 |
GGML_ASSERT(d_Qy != nullptr);
|
|
@@ -2700,15 +2787,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
|
|
| 2700 |
if (x_non_contig) {
|
| 2701 |
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
|
| 2702 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
| 2703 |
-
} else if (load_x) {
|
| 2704 |
-
// copy data to device
|
| 2705 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0));
|
| 2706 |
}
|
| 2707 |
if (y_non_contig) {
|
| 2708 |
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
| 2709 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
| 2710 |
-
} else if (load_y) {
|
| 2711 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1));
|
| 2712 |
}
|
| 2713 |
|
| 2714 |
for (uint64_t i13 = 0; i13 < ne13; i13++) {
|
|
@@ -2789,8 +2871,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
| 2789 |
src1_uma = d_Qy != nullptr;
|
| 2790 |
}
|
| 2791 |
|
| 2792 |
-
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
| 2793 |
-
|
| 2794 |
const uint64_t x_ne = ne00 * ne01 * ne02;
|
| 2795 |
const uint64_t y_ne = ne10 * ne11 * ne12;
|
| 2796 |
const uint64_t d_ne = ne01 * ne11 * ne12;
|
|
@@ -2805,9 +2885,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
| 2805 |
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
| 2806 |
const uint64_t qx_buf_offset = extra_src0->offset;
|
| 2807 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2808 |
-
if (
|
| 2809 |
-
d_Qy = ctx->prealloc_qy;
|
| 2810 |
-
} else if (!src1_uma) {
|
| 2811 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2812 |
qy_buf_offset = extra_src1->offset;
|
| 2813 |
GGML_ASSERT(d_Qx != nullptr);
|
|
@@ -2822,10 +2900,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
|
|
| 2822 |
const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 2823 |
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
| 2824 |
|
| 2825 |
-
if (load_y) {
|
| 2826 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
|
| 2827 |
-
}
|
| 2828 |
-
|
| 2829 |
// compute
|
| 2830 |
const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
| 2831 |
ggml_vk_sync_buffers(subctx);
|
|
@@ -2881,8 +2955,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
| 2881 |
src1_uma = d_Qy != nullptr;
|
| 2882 |
}
|
| 2883 |
|
| 2884 |
-
const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
| 2885 |
-
|
| 2886 |
const uint64_t d_ne = ne01 * ne11 * ne12;
|
| 2887 |
|
| 2888 |
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
|
|
@@ -2898,9 +2970,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
| 2898 |
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
| 2899 |
const uint64_t qx_buf_offset = extra_src0->offset;
|
| 2900 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2901 |
-
if (
|
| 2902 |
-
d_Qy = ctx->prealloc_qy;
|
| 2903 |
-
} else {
|
| 2904 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2905 |
qy_buf_offset = extra_src1->offset;
|
| 2906 |
GGML_ASSERT(d_Qx != nullptr);
|
|
@@ -2915,10 +2985,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
|
|
| 2915 |
const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 2916 |
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
| 2917 |
|
| 2918 |
-
if (load_y) {
|
| 2919 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1));
|
| 2920 |
-
}
|
| 2921 |
-
|
| 2922 |
// compute
|
| 2923 |
const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
| 2924 |
ggml_vk_sync_buffers(subctx);
|
|
@@ -3174,7 +3240,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3174 |
}
|
| 3175 |
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
|
| 3176 |
#endif
|
| 3177 |
-
GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT
|
| 3178 |
GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
|
| 3179 |
GGML_ASSERT(dst->extra != nullptr);
|
| 3180 |
const uint64_t ne00 = src0->ne[0];
|
|
@@ -3242,11 +3308,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3242 |
}
|
| 3243 |
}
|
| 3244 |
|
| 3245 |
-
|
| 3246 |
-
const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma;
|
| 3247 |
-
const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma;
|
| 3248 |
-
|
| 3249 |
-
uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
|
| 3250 |
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
| 3251 |
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
|
| 3252 |
uint64_t d_sz = ggml_type_size(dst->type) * ne0;
|
|
@@ -3261,55 +3323,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3261 |
GGML_ASSERT(d_D != nullptr);
|
| 3262 |
uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 3263 |
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
|
| 3264 |
-
if
|
| 3265 |
-
d_X = ctx->prealloc_qx;
|
| 3266 |
-
} else if(!src0_uma) {
|
| 3267 |
d_X = extra_src0->buffer_gpu.lock();
|
| 3268 |
x_buf_offset = extra_src0->offset;
|
| 3269 |
GGML_ASSERT(d_X != nullptr);
|
| 3270 |
}
|
| 3271 |
-
if (
|
| 3272 |
-
d_Y = ctx->prealloc_qy;
|
| 3273 |
-
} else if (use_src1 && !src1_uma) {
|
| 3274 |
d_Y = extra_src1->buffer_gpu.lock();
|
| 3275 |
y_buf_offset = extra_src1->offset;
|
| 3276 |
GGML_ASSERT(d_Y != nullptr);
|
| 3277 |
}
|
| 3278 |
|
| 3279 |
-
GGML_ASSERT(!transfer_src2);
|
| 3280 |
if (use_src2 && !src2_uma) {
|
| 3281 |
d_Z = extra_src2->buffer_gpu.lock();
|
| 3282 |
z_buf_offset = extra_src2->offset;
|
| 3283 |
GGML_ASSERT(d_Z != nullptr);
|
| 3284 |
}
|
| 3285 |
|
| 3286 |
-
if (op == GGML_OP_CPY) {
|
| 3287 |
-
GGML_ASSERT(!transfer_src0);
|
| 3288 |
-
GGML_ASSERT(!transfer_src1);
|
| 3289 |
x_sz = ggml_nbytes(src0);
|
|
|
|
| 3290 |
d_sz = ggml_nbytes(dst);
|
| 3291 |
|
| 3292 |
-
if (
|
| 3293 |
x_sz = VK_WHOLE_SIZE;
|
| 3294 |
}
|
| 3295 |
-
if (
|
| 3296 |
d_sz = VK_WHOLE_SIZE;
|
| 3297 |
}
|
| 3298 |
}
|
| 3299 |
|
| 3300 |
std::array<uint32_t, 3> elements;
|
| 3301 |
|
| 3302 |
-
// copy src0 to device
|
| 3303 |
-
if (transfer_src0) {
|
| 3304 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0));
|
| 3305 |
-
ctx->staging_offset = x_sz * ne02 * ne03;
|
| 3306 |
-
}
|
| 3307 |
-
if (transfer_src1) {
|
| 3308 |
-
ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1));
|
| 3309 |
-
}
|
| 3310 |
-
|
| 3311 |
// Single call if dimension 2 is contiguous
|
| 3312 |
-
if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
|
| 3313 |
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
|
| 3314 |
|
| 3315 |
switch (dst->op) {
|
|
@@ -3322,16 +3372,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3322 |
case GGML_OP_ROPE:
|
| 3323 |
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
|
| 3324 |
break;
|
| 3325 |
default:
|
| 3326 |
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
|
| 3327 |
break;
|
| 3328 |
}
|
| 3329 |
|
| 3330 |
-
if (op != GGML_OP_CPY) {
|
| 3331 |
if (x_sz != VK_WHOLE_SIZE) {
|
| 3332 |
x_sz *= ne02 * ne03;
|
| 3333 |
}
|
| 3334 |
-
if (y_sz != VK_WHOLE_SIZE) {
|
| 3335 |
y_sz *= ne12 * ne13;
|
| 3336 |
}
|
| 3337 |
if (d_sz != VK_WHOLE_SIZE) {
|
|
@@ -3386,6 +3439,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3386 |
case GGML_OP_ROPE:
|
| 3387 |
elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
|
| 3388 |
break;
|
| 3389 |
default:
|
| 3390 |
elements = { (uint32_t)ne0, 1, 1 };
|
| 3391 |
break;
|
|
@@ -3420,7 +3476,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c
|
|
| 3420 |
}
|
| 3421 |
|
| 3422 |
static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3423 |
-
|
| 3424 |
}
|
| 3425 |
|
| 3426 |
static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
|
@@ -3576,9 +3643,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
|
|
| 3576 |
if (is_neox) {
|
| 3577 |
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
| 3578 |
const float inv_ndims = -1.0f / n_dims;
|
| 3579 |
-
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims });
|
| 3580 |
} else {
|
| 3581 |
-
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f });
|
| 3582 |
}
|
| 3583 |
}
|
| 3584 |
|
|
@@ -3587,16 +3654,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
|
|
| 3587 |
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
|
| 3588 |
}
|
| 3589 |
|
| 3590 |
-
static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
|
| 3591 |
-
// If backend is CPU, data from src0 has to be copied off the device
|
| 3592 |
-
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
| 3593 |
-
ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
|
| 3594 |
-
vk_buffer d_D = extra_src0->buffer_gpu.lock();
|
| 3595 |
-
ggml_vk_sync_buffers(subctx);
|
| 3596 |
-
ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size);
|
| 3597 |
-
}
|
| 3598 |
-
}
|
| 3599 |
-
|
| 3600 |
#ifdef GGML_VULKAN_RUN_TESTS
|
| 3601 |
static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
|
| 3602 |
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
|
|
@@ -3619,6 +3676,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
|
|
| 3619 |
val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
|
| 3620 |
} else if (type == GGML_TYPE_F16) {
|
| 3621 |
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
|
| 3622 |
}
|
| 3623 |
fprintf(stderr, "% 7.2f ", val);
|
| 3624 |
} else {
|
|
@@ -3920,6 +3979,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
|
|
| 3920 |
val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
| 3921 |
} else if (tensor->type == GGML_TYPE_F16) {
|
| 3922 |
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
| 3923 |
}
|
| 3924 |
fprintf(stderr, "% 7.2f ", val);
|
| 3925 |
} else {
|
|
@@ -4335,7 +4396,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
|
|
| 4335 |
|
| 4336 |
std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
|
| 4337 |
|
| 4338 |
-
if (avg_err > 0.
|
| 4339 |
std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
|
| 4340 |
std::cerr << "Actual result: " << std::endl << std::endl;
|
| 4341 |
ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
|
@@ -4385,27 +4446,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor)
|
|
| 4385 |
return extra;
|
| 4386 |
}
|
| 4387 |
|
| 4388 |
-
static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) {
|
| 4389 |
-
return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID;
|
| 4390 |
-
}
|
| 4391 |
-
|
| 4392 |
static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
|
| 4393 |
#ifdef GGML_VULKAN_DEBUG
|
| 4394 |
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
|
| 4395 |
#endif
|
| 4396 |
-
|
| 4397 |
-
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
| 4398 |
-
|| (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU));
|
| 4399 |
-
|
| 4400 |
-
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) {
|
| 4401 |
return;
|
| 4402 |
}
|
| 4403 |
|
| 4404 |
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
|
| 4405 |
-
if (extra == nullptr) {
|
| 4406 |
-
// Workaround for CPU backend BLAS matmul calls
|
| 4407 |
-
extra = ggml_vk_tensor_create_extra(node);
|
| 4408 |
-
}
|
| 4409 |
|
| 4410 |
ggml_tensor * src0 = node->src[0];
|
| 4411 |
ggml_tensor * src1 = node->src[1];
|
|
@@ -4425,7 +4474,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
| 4425 |
const int64_t ne22 = node->ne[2];
|
| 4426 |
const int64_t ne23 = node->ne[3];
|
| 4427 |
|
| 4428 |
-
const
|
| 4429 |
|
| 4430 |
int split_k;
|
| 4431 |
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {
|
|
@@ -4437,10 +4497,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
| 4437 |
const uint32_t y_ne = ne10 * ne11;
|
| 4438 |
const uint32_t d_ne = ne20 * ne21;
|
| 4439 |
|
| 4440 |
-
const uint64_t
|
| 4441 |
-
const uint64_t
|
| 4442 |
-
const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
|
| 4443 |
-
const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
|
| 4444 |
uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
|
| 4445 |
const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
|
| 4446 |
|
|
@@ -4483,12 +4541,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
|
|
| 4483 |
break;
|
| 4484 |
case GGML_OP_MUL_MAT:
|
| 4485 |
case GGML_OP_MUL_MAT_ID:
|
| 4486 |
-
if (ctx->prealloc_size_qx < qx_sz) {
|
| 4487 |
-
ctx->prealloc_size_qx = qx_sz;
|
| 4488 |
-
}
|
| 4489 |
-
if (ctx->prealloc_size_qy < qy_sz) {
|
| 4490 |
-
ctx->prealloc_size_qy = qy_sz;
|
| 4491 |
-
}
|
| 4492 |
if (ctx->prealloc_size_x < x_sz) {
|
| 4493 |
ctx->prealloc_size_x = x_sz;
|
| 4494 |
}
|
|
@@ -4512,7 +4564,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
| 4512 |
return;
|
| 4513 |
}
|
| 4514 |
#ifdef GGML_VULKAN_DEBUG
|
| 4515 |
-
std::cerr << "ggml_vk_preallocate_buffers(
|
| 4516 |
#endif
|
| 4517 |
#if defined(GGML_VULKAN_RUN_TESTS)
|
| 4518 |
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
|
|
@@ -4575,6 +4627,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
| 4575 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
|
| 4576 |
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
|
| 4577 |
| 4578 |
std::cerr << std::endl;
|
| 4579 |
|
| 4580 |
const std::vector<size_t> vals {
|
|
@@ -4614,20 +4701,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
| 4614 |
GGML_ASSERT(false);
|
| 4615 |
#endif
|
| 4616 |
|
| 4617 |
-
if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) {
|
| 4618 |
-
// Resize buffer
|
| 4619 |
-
if (ctx->prealloc_qx != nullptr) {
|
| 4620 |
-
ggml_vk_destroy_buffer(ctx->prealloc_qx);
|
| 4621 |
-
}
|
| 4622 |
-
ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx);
|
| 4623 |
-
}
|
| 4624 |
-
if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) {
|
| 4625 |
-
// Resize buffer
|
| 4626 |
-
if (ctx->prealloc_qy != nullptr) {
|
| 4627 |
-
ggml_vk_destroy_buffer(ctx->prealloc_qy);
|
| 4628 |
-
}
|
| 4629 |
-
ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy);
|
| 4630 |
-
}
|
| 4631 |
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
|
| 4632 |
// Resize buffer
|
| 4633 |
if (ctx->prealloc_x != nullptr) {
|
|
@@ -4661,11 +4734,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
|
|
| 4661 |
}
|
| 4662 |
|
| 4663 |
static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
|
| 4664 |
-
|
| 4665 |
-
|| (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
| 4666 |
-
|| (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
| 4667 |
-
|
| 4668 |
-
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) {
|
| 4669 |
return;
|
| 4670 |
}
|
| 4671 |
|
|
@@ -4693,7 +4762,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
| 4693 |
}
|
| 4694 |
break;
|
| 4695 |
case GGML_OP_REPEAT:
|
| 4696 |
-
|
| 4697 |
case GGML_OP_ADD:
|
| 4698 |
case GGML_OP_MUL:
|
| 4699 |
case GGML_OP_SCALE:
|
|
@@ -4717,10 +4786,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
| 4717 |
case GGML_OP_ARGSORT:
|
| 4718 |
break;
|
| 4719 |
default:
|
| 4720 |
-
|
| 4721 |
-
|
| 4722 |
-
GGML_ASSERT(false);
|
| 4723 |
-
}
|
| 4724 |
return;
|
| 4725 |
}
|
| 4726 |
|
|
@@ -4769,8 +4836,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
| 4769 |
case GGML_OP_PERMUTE:
|
| 4770 |
case GGML_OP_TRANSPOSE:
|
| 4771 |
case GGML_OP_NONE:
|
| 4772 |
-
ggml_vk_nop(ctx, ctx->compute_ctx, src0, node);
|
| 4773 |
-
|
| 4774 |
break;
|
| 4775 |
case GGML_OP_NORM:
|
| 4776 |
ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);
|
|
@@ -4837,11 +4902,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
|
| 4837 |
}
|
| 4838 |
|
| 4839 |
static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
|
| 4840 |
-
|
| 4841 |
-
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|
| 4842 |
-
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
|
| 4843 |
-
|
| 4844 |
-
if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) {
|
| 4845 |
return false;
|
| 4846 |
}
|
| 4847 |
|
|
@@ -4884,10 +4945,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 4884 |
break;
|
| 4885 |
case GGML_OP_MUL_MAT:
|
| 4886 |
case GGML_OP_MUL_MAT_ID:
|
| 4887 |
-
if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
|
| 4888 |
-
return false;
|
| 4889 |
-
}
|
| 4890 |
-
|
| 4891 |
extra = (ggml_tensor_extra_gpu *) tensor->extra;
|
| 4892 |
|
| 4893 |
break;
|
|
@@ -5001,8 +5058,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
| 5001 |
#endif
|
| 5002 |
ggml_vk_graph_cleanup(ctx);
|
| 5003 |
|
| 5004 |
-
ggml_vk_destroy_buffer(ctx->prealloc_qx);
|
| 5005 |
-
ggml_vk_destroy_buffer(ctx->prealloc_qy);
|
| 5006 |
ggml_vk_destroy_buffer(ctx->prealloc_x);
|
| 5007 |
ggml_vk_destroy_buffer(ctx->prealloc_y);
|
| 5008 |
ggml_vk_destroy_buffer(ctx->prealloc_split_k);
|
|
@@ -5013,8 +5068,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
|
|
| 5013 |
ggml_vk_destroy_buffer(buffer);
|
| 5014 |
}
|
| 5015 |
|
| 5016 |
-
ctx->prealloc_size_qx = 0;
|
| 5017 |
-
ctx->prealloc_size_qy = 0;
|
| 5018 |
ctx->prealloc_size_x = 0;
|
| 5019 |
ctx->prealloc_size_y = 0;
|
| 5020 |
ctx->prealloc_size_split_k = 0;
|
|
@@ -5045,80 +5098,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript
|
|
| 5045 |
snprintf(description, description_size, "%s", props.deviceName.data());
|
| 5046 |
}
|
| 5047 |
|
| 5048 |
-
// CPU assist interface
|
| 5049 |
-
|
| 5050 |
-
void ggml_vk_init_cpu_assist() {
|
| 5051 |
-
ggml_vk_instance_init();
|
| 5052 |
-
|
| 5053 |
-
std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl;
|
| 5054 |
-
|
| 5055 |
-
for (int i = 0; i < ggml_vk_get_device_count(); i++) {
|
| 5056 |
-
ggml_vk_print_gpu_info(i);
|
| 5057 |
-
}
|
| 5058 |
-
// Initialize the first backend to make sure CPU matrix multiplications can be offloaded.
|
| 5059 |
-
ggml_backend_vk_init(0);
|
| 5060 |
-
}
|
| 5061 |
-
|
| 5062 |
-
void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) {
|
| 5063 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 5064 |
-
|
| 5065 |
-
if (!ctx->initialized) {
|
| 5066 |
-
return;
|
| 5067 |
-
}
|
| 5068 |
-
|
| 5069 |
-
ggml_vk_preallocate_buffers_graph(ctx, node);
|
| 5070 |
-
}
|
| 5071 |
-
|
| 5072 |
-
void ggml_vk_preallocate_buffers_cpu_assist() {
|
| 5073 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 5074 |
-
|
| 5075 |
-
if (!ctx->initialized) {
|
| 5076 |
-
return;
|
| 5077 |
-
}
|
| 5078 |
-
|
| 5079 |
-
ggml_vk_preallocate_buffers(ctx);
|
| 5080 |
-
}
|
| 5081 |
-
|
| 5082 |
-
void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) {
|
| 5083 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 5084 |
-
|
| 5085 |
-
if (!ctx->initialized) {
|
| 5086 |
-
return;
|
| 5087 |
-
}
|
| 5088 |
-
|
| 5089 |
-
ggml_vk_build_graph(ctx, node, last_node);
|
| 5090 |
-
}
|
| 5091 |
-
|
| 5092 |
-
bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){
|
| 5093 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 5094 |
-
|
| 5095 |
-
if (!ctx->initialized) {
|
| 5096 |
-
return false;
|
| 5097 |
-
}
|
| 5098 |
-
|
| 5099 |
-
return ggml_vk_compute_forward(ctx, params, tensor);
|
| 5100 |
-
}
|
| 5101 |
-
|
| 5102 |
-
void ggml_vk_graph_cleanup_cpu_assist() {
|
| 5103 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 5104 |
-
|
| 5105 |
-
if (!ctx->initialized) {
|
| 5106 |
-
return;
|
| 5107 |
-
}
|
| 5108 |
-
|
| 5109 |
-
ggml_vk_graph_cleanup(ctx);
|
| 5110 |
-
}
|
| 5111 |
-
|
| 5112 |
-
void ggml_vk_free_cpu_assist() {
|
| 5113 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 5114 |
-
|
| 5115 |
-
if (!ctx->initialized || vk_instance.backends[0] == nullptr) {
|
| 5116 |
-
return;
|
| 5117 |
-
}
|
| 5118 |
-
|
| 5119 |
-
ggml_backend_vk_free(vk_instance.backends[0]);
|
| 5120 |
-
}
|
| 5121 |
-
|
| 5122 |
// backend interface
|
| 5123 |
|
| 5124 |
#define UNUSED GGML_UNUSED
|
|
@@ -5330,16 +5309,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
|
|
| 5330 |
/* .is_host = */ NULL,
|
| 5331 |
};
|
| 5332 |
|
| 5333 |
-
GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t
|
| 5334 |
#ifdef GGML_VULKAN_DEBUG
|
| 5335 |
-
std::cerr << "ggml_backend_vk_buffer_type(" <<
|
| 5336 |
#endif
|
| 5337 |
|
| 5338 |
-
GGML_ASSERT(
|
| 5339 |
|
| 5340 |
-
ggml_backend_vk_init(
|
| 5341 |
|
| 5342 |
-
return &vk_instance.buffer_types[
|
| 5343 |
}
|
| 5344 |
|
| 5345 |
// host buffer type
|
|
@@ -5508,7 +5487,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
|
|
| 5508 |
vk_buffer src_buf = src_extra->buffer_gpu.lock();
|
| 5509 |
vk_buffer dst_buf = dst_extra->buffer_gpu.lock();
|
| 5510 |
|
| 5511 |
-
ggml_vk_buffer_copy_async(ctx->transfer_ctx,
|
| 5512 |
return true;
|
| 5513 |
}
|
| 5514 |
|
|
@@ -5542,6 +5521,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
|
|
| 5542 |
}
|
| 5543 |
|
| 5544 |
GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
| 5545 |
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
|
| 5546 |
|
| 5547 |
for (int i = 0; i < cgraph->n_nodes; i++) {
|
|
@@ -5602,8 +5584,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 5602 |
}
|
| 5603 |
break;
|
| 5604 |
case GGML_OP_MUL_MAT:
|
| 5605 |
-
case GGML_OP_MUL_MAT_ID:
|
| 5606 |
{
|
| 5607 |
struct ggml_tensor * a;
|
| 5608 |
struct ggml_tensor * b;
|
| 5609 |
if (op->op == GGML_OP_MUL_MAT) {
|
|
@@ -5618,25 +5617,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 5618 |
}
|
| 5619 |
return true;
|
| 5620 |
} break;
|
| 5621 |
-
|
| 5622 |
-
|
| 5623 |
-
|
| 5624 |
-
|
| 5625 |
-
|
| 5626 |
-
|
| 5627 |
-
|
| 5628 |
-
|
| 5629 |
-
|
| 5630 |
-
|
| 5631 |
-
|
| 5632 |
-
|
| 5633 |
-
|
| 5634 |
-
|
| 5635 |
-
|
| 5636 |
case GGML_OP_CPY:
|
|
|
|
| 5637 |
{
|
| 5638 |
ggml_type src0_type = op->src[0]->type;
|
| 5639 |
-
ggml_type src1_type = op->src[1]->type;
|
| 5640 |
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
|
| 5641 |
return true;
|
| 5642 |
}
|
|
@@ -5648,7 +5648,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 5648 |
}
|
| 5649 |
return false;
|
| 5650 |
} break;
|
| 5651 |
-
case GGML_OP_DUP:
|
| 5652 |
// case GGML_OP_REPEAT:
|
| 5653 |
// {
|
| 5654 |
// ggml_type src0_type = op->src[0]->type;
|
|
@@ -5685,6 +5684,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
|
|
| 5685 |
UNUSED(backend);
|
| 5686 |
}
|
| 5688 |
// TODO: enable async and synchronize
|
| 5689 |
static ggml_backend_i ggml_backend_vk_interface = {
|
| 5690 |
/* .get_name = */ ggml_backend_vk_name,
|
|
@@ -5699,7 +5712,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
|
|
| 5699 |
/* .graph_plan_compute = */ NULL,
|
| 5700 |
/* .graph_compute = */ ggml_backend_vk_graph_compute,
|
| 5701 |
/* .supports_op = */ ggml_backend_vk_supports_op,
|
| 5702 |
-
/* .offload_op = */
|
| 5703 |
/* .event_new = */ NULL,
|
| 5704 |
/* .event_free = */ NULL,
|
| 5705 |
/* .event_record = */ NULL,
|
|
@@ -5712,22 +5725,22 @@ static ggml_guid_t ggml_backend_vk_guid() {
|
|
| 5712 |
return &guid;
|
| 5713 |
}
|
| 5714 |
|
| 5715 |
-
GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t
|
| 5716 |
-
if (vk_instance.initialized[
|
| 5717 |
-
return vk_instance.backends[
|
| 5718 |
}
|
| 5719 |
#ifdef GGML_VULKAN_DEBUG
|
| 5720 |
-
std::cerr << "ggml_backend_vk_init(" <<
|
| 5721 |
#endif
|
| 5722 |
|
| 5723 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[
|
| 5724 |
-
ggml_vk_init(ctx,
|
| 5725 |
-
ctx->name = GGML_VK_NAME + std::to_string(
|
| 5726 |
-
vk_instance.buffer_types[
|
| 5727 |
/* .iface = */ ggml_backend_vk_buffer_type_interface,
|
| 5728 |
/* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
|
| 5729 |
};
|
| 5730 |
-
vk_instance.initialized[
|
| 5731 |
|
| 5732 |
ggml_backend_t vk_backend = new ggml_backend {
|
| 5733 |
/* .guid = */ ggml_backend_vk_guid(),
|
|
@@ -5735,7 +5748,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) {
|
|
| 5735 |
/* .context = */ &vk_instance.contexts[ctx->idx],
|
| 5736 |
};
|
| 5737 |
|
| 5738 |
-
vk_instance.backends[
|
| 5739 |
|
| 5740 |
return vk_backend;
|
| 5741 |
}
|
|
@@ -5779,10 +5792,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo
|
|
| 5779 |
extern "C" GGML_CALL int ggml_backend_vk_reg_devices();
|
| 5780 |
|
| 5781 |
GGML_CALL int ggml_backend_vk_reg_devices() {
|
| 5782 |
-
|
| 5783 |
char name[128];
|
| 5784 |
-
snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME,
|
| 5785 |
-
ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(
|
| 5786 |
}
|
| 5787 |
return vk_instance.device_indices.size();
|
| 5788 |
}
|
|
@@ -5866,6 +5881,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
|
|
| 5866 |
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
| 5867 |
} else if (tensor->type == GGML_TYPE_F16) {
|
| 5868 |
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
| 5869 |
}
|
| 5870 |
fprintf(stderr, "% 7.2f ", val);
|
| 5871 |
} else {
|
|
@@ -5960,6 +5977,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 5960 |
return;
|
| 5961 |
}
|
| 5963 |
ggml_tensor * src0 = tensor->src[0];
|
| 5964 |
ggml_tensor * src1 = tensor->src[1];
|
| 5965 |
ggml_tensor * src2 = tensor->src[2];
|
|
@@ -6219,6 +6240,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6219 |
tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
|
| 6220 |
} else if (tensor->op == GGML_OP_TRANSPOSE) {
|
| 6221 |
tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
|
| 6222 |
} else {
|
| 6223 |
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
| 6224 |
GGML_ASSERT(false);
|
|
@@ -6269,6 +6292,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6269 |
return;
|
| 6270 |
}
|
| 6272 |
ggml_tensor * src0 = tensor->src[0];
|
| 6273 |
ggml_tensor * src1 = tensor->src[1];
|
| 6274 |
|
|
@@ -6412,10 +6439,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
|
|
| 6412 |
free(tensor_data);
|
| 6413 |
}
|
| 6414 |
}
|
| 6415 |
-
|
| 6416 |
-
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
| 6417 |
-
ggml_backend_vk_context * ctx = &vk_instance.contexts[0];
|
| 6418 |
-
|
| 6419 |
-
ggml_vk_check_results_0(ctx, params, tensor);
|
| 6420 |
-
}
|
| 6421 |
#endif
|
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1167 |
+
|
| 1168 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1169 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1170 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1171 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1172 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1173 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1174 |
+
|
| 1175 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1176 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1177 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1178 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1179 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1180 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1181 |
+
|
| 1182 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1183 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1184 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1185 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align);
|
| 1186 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align);
|
| 1187 |
+
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align);
|
| 1188 |
}
|
| 1189 |
|
| 1190 |
ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);

ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);

// get_rows
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1);

ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
vk_instance.device_indices.push_back(tmp);
}
} else {
+ std::vector<vk::PhysicalDevice> devices = vk_instance.instance.enumeratePhysicalDevices();
+
+ // Make sure at least one device exists
+ if (devices.empty()) {
+ std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
+ GGML_ASSERT(false);
+ }
+
+ // Default to using all dedicated GPUs
+ for (size_t i = 0; i < devices.size(); i++) {
+ vk::PhysicalDeviceProperties props = devices[i].getProperties();
+
+ if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
+ vk_instance.device_indices.push_back(i);
+ }
+ }
+
+ // If no dedicated GPUs found, fall back to GPU 0
+ if (vk_instance.device_indices.empty()) {
+ vk_instance.device_indices.push_back(0);
+ }
+ }
+
+ std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl;
+
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
+ ggml_vk_print_gpu_info(i);
}

vk_instance_initialized = true;
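For reference, the device-selection policy added above amounts to "use every discrete GPU, otherwise fall back to device 0". The following is only an illustrative standalone sketch of that policy, not the function as it appears in the backend; the `instance` handle and the helper name are assumptions.

```cpp
// Sketch only: mirrors the selection policy from the hunk above.
#include <vulkan/vulkan.hpp>
#include <cstddef>
#include <vector>

static std::vector<size_t> pick_default_devices(const vk::Instance & instance) {
    std::vector<size_t> indices;
    std::vector<vk::PhysicalDevice> devices = instance.enumeratePhysicalDevices();
    for (size_t i = 0; i < devices.size(); i++) {
        // Prefer dedicated (discrete) GPUs by default.
        if (devices[i].getProperties().deviceType == vk::PhysicalDeviceType::eDiscreteGpu) {
            indices.push_back(i);
        }
    }
    if (indices.empty() && !devices.empty()) {
        // No discrete GPU found: fall back to the first device (e.g. an integrated GPU).
        indices.push_back(0);
    }
    return indices;
}
```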
switch (src0_type) {
case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
break;
default:
return nullptr;
| 2145 |
ggml_vk_submit(subctx, ctx->fence);
|
| 2146 |
VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
|
| 2147 |
ctx->device->device.resetFences({ ctx->fence });
|
|
|
|
| 2148 |
}
|
| 2149 |
}
|
| 2150 |
|
|
|
|
| 2241 |
for (auto& cpy : subctx->out_memcpys) {
|
| 2242 |
memcpy(cpy.dst, cpy.src, cpy.n);
|
| 2243 |
}
|
|
|
|
| 2244 |
}
|
| 2245 |
}
|
| 2246 |
|
|
|
|
| 2407 |
return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned);
|
| 2408 |
case VK_VENDOR_ID_INTEL:
|
| 2409 |
return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned);
|
| 2410 |
+
default:
|
| 2411 |
+
break;
|
| 2412 |
}
|
| 2413 |
|
| 2414 |
if (m <= 32 || n <= 32) {
|
|
|
|
| 2534 |
src1_uma = d_Qy != nullptr;
|
| 2535 |
}
|
| 2536 |
|
| 2537 |
+
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
|
| 2538 |
+
const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
|
|
|
|
|
|
|
|
|
|
| 2539 |
|
| 2540 |
const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig;
|
| 2541 |
|
|
|
|
| 2577 |
uint64_t x_buf_offset = 0;
|
| 2578 |
vk_buffer d_Y;
|
| 2579 |
uint64_t y_buf_offset = 0;
|
| 2580 |
+
if (!src0_uma) {
|
|
|
|
|
|
|
| 2581 |
d_Qx = extra_src0->buffer_gpu.lock();
|
| 2582 |
qx_buf_offset = extra_src0->offset;
|
| 2583 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2584 |
}
|
| 2585 |
+
if (!src1_uma) {
|
|
|
|
|
|
|
| 2586 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2587 |
qy_buf_offset = extra_src1->offset;
|
| 2588 |
GGML_ASSERT(d_Qy != nullptr);
|
|
|
|
| 2634 |
|
| 2635 |
if (x_non_contig) {
|
| 2636 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
| 2637 |
+
} else if (qx_needs_dequant) {
|
| 2638 |
+
const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
|
| 2639 |
+
ggml_vk_sync_buffers(subctx);
|
| 2640 |
+
ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2641 |
}
|
| 2642 |
if (y_non_contig) {
|
| 2643 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
|
|
|
|
|
| 2644 |
}
|
| 2645 |
|
| 2646 |
uint32_t stride_batch_x = ne00*ne01;
|
| 2647 |
uint32_t stride_batch_y = ne10*ne11;
|
| 2648 |
|
| 2649 |
+
if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) {
|
| 2650 |
stride_batch_x = src0->nb[0] / ggml_type_size(src0->type);
|
| 2651 |
}
|
| 2652 |
|
| 2653 |
+
if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) {
|
| 2654 |
stride_batch_y = src1->nb[0] / ggml_type_size(src1->type);
|
| 2655 |
}
|
| 2656 |
|
|
|
|
| 2710 |
src1_uma = d_Qy != nullptr;
|
| 2711 |
}
|
| 2712 |
|
| 2713 |
+
const bool x_non_contig = !ggml_vk_dim01_contiguous(src0);
|
| 2714 |
+
const bool y_non_contig = !ggml_vk_dim01_contiguous(src1);
|
|
|
|
|
|
|
|
|
|
| 2715 |
|
| 2716 |
const bool f16_f32_kernel = src1->type == GGML_TYPE_F32;
|
| 2717 |
|
|
|
|
| 2735 |
uint64_t x_buf_offset = 0;
|
| 2736 |
vk_buffer d_Y;
|
| 2737 |
uint64_t y_buf_offset = 0;
|
| 2738 |
+
if(!src0_uma) {
|
|
|
|
|
|
|
| 2739 |
d_Qx = extra_src0->buffer_gpu.lock();
|
| 2740 |
qx_buf_offset = extra_src0->offset;
|
| 2741 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2742 |
}
|
| 2743 |
+
if(!src1_uma) {
|
|
|
|
|
|
|
| 2744 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2745 |
qy_buf_offset = extra_src1->offset;
|
| 2746 |
GGML_ASSERT(d_Qy != nullptr);
|
|
|
|
| 2787 |
if (x_non_contig) {
|
| 2788 |
GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
|
| 2789 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE });
|
|
|
|
|
|
|
|
|
|
| 2790 |
}
|
| 2791 |
if (y_non_contig) {
|
| 2792 |
GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne);
|
| 2793 |
ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE });
|
|
|
|
|
|
|
| 2794 |
}
|
| 2795 |
|
| 2796 |
for (uint64_t i13 = 0; i13 < ne13; i13++) {
|
|
|
|
| 2871 |
src1_uma = d_Qy != nullptr;
|
| 2872 |
}
|
| 2873 |
|
|
|
|
|
|
|
| 2874 |
const uint64_t x_ne = ne00 * ne01 * ne02;
|
| 2875 |
const uint64_t y_ne = ne10 * ne11 * ne12;
|
| 2876 |
const uint64_t d_ne = ne01 * ne11 * ne12;
|
|
|
|
| 2885 |
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
| 2886 |
const uint64_t qx_buf_offset = extra_src0->offset;
|
| 2887 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2888 |
+
if (!src1_uma) {
|
|
|
|
|
|
|
| 2889 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2890 |
qy_buf_offset = extra_src1->offset;
|
| 2891 |
GGML_ASSERT(d_Qx != nullptr);
|
|
|
|
| 2900 |
const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 2901 |
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
| 2902 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2903 |
// compute
|
| 2904 |
const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
| 2905 |
ggml_vk_sync_buffers(subctx);
|
|
|
|
| 2955 |
src1_uma = d_Qy != nullptr;
|
| 2956 |
}
|
| 2957 |
|
|
|
|
|
|
|
| 2958 |
const uint64_t d_ne = ne01 * ne11 * ne12;
|
| 2959 |
|
| 2960 |
const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t);
|
|
|
|
| 2970 |
vk_buffer d_Qx = extra_src0->buffer_gpu.lock();
|
| 2971 |
const uint64_t qx_buf_offset = extra_src0->offset;
|
| 2972 |
GGML_ASSERT(d_Qx != nullptr);
|
| 2973 |
+
if (!src1_uma) {
|
|
|
|
|
|
|
| 2974 |
d_Qy = extra_src1->buffer_gpu.lock();
|
| 2975 |
qy_buf_offset = extra_src1->offset;
|
| 2976 |
GGML_ASSERT(d_Qx != nullptr);
|
|
|
|
| 2985 |
const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
|
| 2986 |
const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
|
| 2987 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2988 |
// compute
|
| 2989 |
const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
|
| 2990 |
ggml_vk_sync_buffers(subctx);
|
|
|
|
}
std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
#endif
+ GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
GGML_ASSERT(dst->extra != nullptr);
const uint64_t ne00 = src0->ne[0];

}
}

+ uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
uint64_t d_sz = ggml_type_size(dst->type) * ne0;

GGML_ASSERT(d_D != nullptr);
uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT
+ if(!src0_uma) {
d_X = extra_src0->buffer_gpu.lock();
x_buf_offset = extra_src0->offset;
GGML_ASSERT(d_X != nullptr);
}
+ if (use_src1 && !src1_uma) {
d_Y = extra_src1->buffer_gpu.lock();
y_buf_offset = extra_src1->offset;
GGML_ASSERT(d_Y != nullptr);
}

if (use_src2 && !src2_uma) {
d_Z = extra_src2->buffer_gpu.lock();
z_buf_offset = extra_src2->offset;
GGML_ASSERT(d_Z != nullptr);
}

+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS) {
x_sz = ggml_nbytes(src0);
+ y_sz = use_src1 ? ggml_nbytes(src1) : 0;
d_sz = ggml_nbytes(dst);

+ if (x_buf_offset + x_sz >= d_X->size) {
x_sz = VK_WHOLE_SIZE;
}
+ if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
+ y_sz = VK_WHOLE_SIZE;
+ }
+ if (d_buf_offset + d_sz >= d_D->size) {
d_sz = VK_WHOLE_SIZE;
}
}

std::array<uint32_t, 3> elements;

// Single call if dimension 2 is contiguous
+ if (op == GGML_OP_CPY || op == GGML_OP_GET_ROWS || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);

switch (dst->op) {

case GGML_OP_ROPE:
elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 };
break;
+ case GGML_OP_GET_ROWS:
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
default:
elements = { (uint32_t)ggml_nelements(src0), 1, 1 };
break;
}

+ if (op != GGML_OP_CPY && op != GGML_OP_GET_ROWS) {
if (x_sz != VK_WHOLE_SIZE) {
x_sz *= ne02 * ne03;
}
+ if (use_src1 && y_sz != VK_WHOLE_SIZE) {
y_sz *= ne12 * ne13;
}
if (d_sz != VK_WHOLE_SIZE) {

case GGML_OP_ROPE:
elements = { (uint32_t)ne01, (uint32_t)ne00, 1 };
break;
+ case GGML_OP_GET_ROWS:
+ elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) };
+ break;
default:
elements = { (uint32_t)ne0, 1, 1 };
break;
}

static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ const uint32_t src0_type_size = ggml_type_size(src0->type);
+ const uint32_t src1_type_size = ggml_type_size(src1->type);
+ const uint32_t dst_type_size = ggml_type_size(dst->type);
+
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
+ (uint32_t)ggml_nelements(src0),
+ (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
+ (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
+ (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
+ 0,
+ 0.0f, 0.0f,
+ });
}

static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
| 3643 |
if (is_neox) {
|
| 3644 |
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
| 3645 |
const float inv_ndims = -1.0f / n_dims;
|
| 3646 |
+
ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims });
|
| 3647 |
} else {
|
| 3648 |
+
ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} });
|
| 3649 |
}
|
| 3650 |
}
|
| 3651 |
|
|
|
|
| 3654 |
ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC });
|
| 3655 |
}
|
| 3656 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3657 |
#ifdef GGML_VULKAN_RUN_TESTS
|
| 3658 |
static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) {
|
| 3659 |
if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) {
|
|
|
|
| 3676 |
val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0);
|
| 3677 |
} else if (type == GGML_TYPE_F16) {
|
| 3678 |
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
|
| 3679 |
+
} else {
|
| 3680 |
+
GGML_ASSERT(false);
|
| 3681 |
}
|
| 3682 |
fprintf(stderr, "% 7.2f ", val);
|
| 3683 |
} else {
|
|
|
|
| 3979 |
val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
| 3980 |
} else if (tensor->type == GGML_TYPE_F16) {
|
| 3981 |
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
| 3982 |
+
} else {
|
| 3983 |
+
GGML_ASSERT(false);
|
| 3984 |
}
|
| 3985 |
fprintf(stderr, "% 7.2f ", val);
|
| 3986 |
} else {
|
|
|
|
| 4396 |
|
| 4397 |
std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl;
|
| 4398 |
|
| 4399 |
+
if (avg_err > 0.01 || std::isnan(avg_err)) {
|
| 4400 |
std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl;
|
| 4401 |
std::cerr << "Actual result: " << std::endl << std::endl;
|
| 4402 |
ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b);
|
|
|
|
| 4446 |
return extra;
|
| 4447 |
}
|
| 4448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4449 |
static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){
#ifdef GGML_VULKAN_DEBUG
std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
#endif
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
return;
}

ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;

ggml_tensor * src0 = node->src[0];
ggml_tensor * src1 = node->src[1];

const int64_t ne22 = node->ne[2];
const int64_t ne23 = node->ne[3];

+ const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16;
+ const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16;
+
+ const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0);
+ const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1);
+
+ const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig;
+
+ bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false;
+
+ const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig);
+ const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig);

int split_k;
if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) {

const uint32_t y_ne = ne10 * ne11;
const uint32_t d_ne = ne20 * ne21;

+ const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+ const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;

break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
if (ctx->prealloc_size_x < x_sz) {
ctx->prealloc_size_x = x_sz;
}

return;
}
#ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
#endif
#if defined(GGML_VULKAN_RUN_TESTS)
ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,

ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0);
ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K);
+
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K);
+ ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K);
+
std::cerr << std::endl;

const std::vector<size_t> vals {

GGML_ASSERT(false);
#endif
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
// Resize buffer
if (ctx->prealloc_x != nullptr) {

}

static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
+ if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
return;
}

}
break;
case GGML_OP_REPEAT:
+ case GGML_OP_GET_ROWS:
case GGML_OP_ADD:
case GGML_OP_MUL:
case GGML_OP_SCALE:

case GGML_OP_ARGSORT:
break;
default:
+ std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
+ GGML_ASSERT(false);
return;
}

case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
case GGML_OP_NONE:
break;
case GGML_OP_NORM:
ggml_vk_norm(ctx, ctx->compute_ctx, src0, node);

}

static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
+ if (ctx->disable) {
return false;
}

break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
extra = (ggml_tensor_extra_gpu *) tensor->extra;

break;

#endif
ggml_vk_graph_cleanup(ctx);

ggml_vk_destroy_buffer(ctx->prealloc_x);
ggml_vk_destroy_buffer(ctx->prealloc_y);
ggml_vk_destroy_buffer(ctx->prealloc_split_k);

ggml_vk_destroy_buffer(buffer);
}

ctx->prealloc_size_x = 0;
ctx->prealloc_size_y = 0;
ctx->prealloc_size_split_k = 0;

snprintf(description, description_size, "%s", props.deviceName.data());
}

// backend interface

#define UNUSED GGML_UNUSED
/* .is_host = */ NULL,
};

+ GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) {
#ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl;
#endif

+ GGML_ASSERT(dev_num < vk_instance.device_indices.size());

+ ggml_backend_vk_init(dev_num);

+ return &vk_instance.buffer_types[dev_num];
}

// host buffer type

vk_buffer src_buf = src_extra->buffer_gpu.lock();
vk_buffer dst_buf = dst_extra->buffer_gpu.lock();

+ ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src));
return true;
}
}

GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ #ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
+ #endif
ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;

for (int i = 0; i < cgraph->n_nodes; i++) {

}
break;
case GGML_OP_MUL_MAT:
+ // case GGML_OP_MUL_MAT_ID:
{
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ case GGML_TYPE_Q2_K:
+ case GGML_TYPE_Q3_K:
+ case GGML_TYPE_Q4_K:
+ case GGML_TYPE_Q5_K:
+ case GGML_TYPE_Q6_K:
+ break;
+ default:
+ return false;
+ }
struct ggml_tensor * a;
struct ggml_tensor * b;
if (op->op == GGML_OP_MUL_MAT) {
}
return true;
} break;
+ case GGML_OP_GET_ROWS:
+ {
+ switch (op->src[0]->type) {
+ case GGML_TYPE_F32:
+ case GGML_TYPE_F16:
+ case GGML_TYPE_Q4_0:
+ case GGML_TYPE_Q4_1:
+ case GGML_TYPE_Q5_0:
+ case GGML_TYPE_Q5_1:
+ case GGML_TYPE_Q8_0:
+ return true;
+ default:
+ return false;
+ }
+ } break;
case GGML_OP_CPY:
+ case GGML_OP_DUP:
{
ggml_type src0_type = op->src[0]->type;
+ ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type;
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
return true;
}

}
return false;
} break;

// case GGML_OP_REPEAT:
// {
// ggml_type src0_type = op->src[0]->type;
UNUSED(backend);
}

+ GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+ const ggml_tensor * dst = op;
+
+ const int min_batch_size = 32;
+
+ if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
+ return true;
+ }
+
+ return false;
+
+ UNUSED(backend);
+ }
+
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,

/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op,
+ /* .offload_op = */ ggml_backend_vk_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
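The new `offload_op` hook is what replaces the old cpu-assist path: ggml-backend can now ask the Vulkan backend whether a node is worth offloading, and the backend answers yes only when the node's second dimension exceeds a small batch threshold (32) and the op is not GGML_OP_GET_ROWS. A hedged caller-side sketch of how the hook might be consulted through the interface table above (the `backend` and `node` variables are assumed to exist):

```cpp
// Sketch only: how a scheduler could query the optional offload hook.
static bool worth_offloading(ggml_backend_t backend, const struct ggml_tensor * node) {
    // Backends that do not implement offload_op never request offloading.
    if (backend->iface.offload_op == NULL) {
        return false;
    }
    return backend->iface.offload_op(backend, node);
}
```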
return &guid;
}

+ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
+ if (vk_instance.initialized[dev_num]) {
+ return vk_instance.backends[dev_num];
}
#ifdef GGML_VULKAN_DEBUG
+ std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl;
#endif

+ ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num];
+ ggml_vk_init(ctx, dev_num);
+ ctx->name = GGML_VK_NAME + std::to_string(dev_num);
+ vk_instance.buffer_types[dev_num] = {
/* .iface = */ ggml_backend_vk_buffer_type_interface,
/* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx },
};
+ vk_instance.initialized[dev_num] = true;

ggml_backend_t vk_backend = new ggml_backend {
/* .guid = */ ggml_backend_vk_guid(),

/* .context = */ &vk_instance.contexts[ctx->idx],
};

+ vk_instance.backends[dev_num] = vk_backend;

return vk_backend;
}

extern "C" GGML_CALL int ggml_backend_vk_reg_devices();

GGML_CALL int ggml_backend_vk_reg_devices() {
+ ggml_vk_instance_init();
+
+ for (size_t i = 0; i < vk_instance.device_indices.size(); i++) {
char name[128];
+ snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i);
+ ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT
}
return vk_instance.device_indices.size();
}
| 5881 |
val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
|
| 5882 |
} else if (tensor->type == GGML_TYPE_F16) {
|
| 5883 |
val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
|
| 5884 |
+
} else {
|
| 5885 |
+
GGML_ASSERT(false);
|
| 5886 |
}
|
| 5887 |
fprintf(stderr, "% 7.2f ", val);
|
| 5888 |
} else {
|
|
|
|
| 5977 |
return;
|
| 5978 |
}
|
| 5979 |
|
| 5980 |
+
#ifdef GGML_VULKAN_DEBUG
|
| 5981 |
+
std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl;
|
| 5982 |
+
#endif
|
| 5983 |
+
|
| 5984 |
ggml_tensor * src0 = tensor->src[0];
|
| 5985 |
ggml_tensor * src1 = tensor->src[1];
|
| 5986 |
ggml_tensor * src2 = tensor->src[2];
|
|
|
|
| 6240 |
tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]);
|
| 6241 |
} else if (tensor->op == GGML_OP_TRANSPOSE) {
|
| 6242 |
tensor_clone = ggml_transpose(ggml_ctx, src0_clone);
|
| 6243 |
+
} else if (tensor->op == GGML_OP_GET_ROWS) {
|
| 6244 |
+
tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone);
|
| 6245 |
} else {
|
| 6246 |
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
|
| 6247 |
GGML_ASSERT(false);
|
|
|
|
| 6292 |
return;
|
| 6293 |
}
|
| 6294 |
|
| 6295 |
+
#ifdef GGML_VULKAN_DEBUG
|
| 6296 |
+
std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl;
|
| 6297 |
+
#endif
|
| 6298 |
+
|
| 6299 |
ggml_tensor * src0 = tensor->src[0];
|
| 6300 |
ggml_tensor * src1 = tensor->src[1];
|
| 6301 |
|
|
|
|
| 6439 |
free(tensor_data);
|
| 6440 |
}
|
| 6441 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6442 |
#endif
|
ggml-vulkan.h
CHANGED

@@ -11,17 +11,6 @@ extern "C" {
#define GGML_VK_MAX_DEVICES 16

GGML_API void ggml_vk_instance_init(void);
- GGML_API void ggml_vk_init_cpu_assist(void);
-
- GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
- GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
- GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
- GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
- #endif
- GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
- GGML_API void ggml_vk_free_cpu_assist(void);

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
ggml.c
CHANGED

@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#include <Accelerate/Accelerate.h>
#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
#include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
#endif
#elif defined(GGML_USE_OPENBLAS)
#if defined(GGML_BLAS_USE_MKL)

@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#endif
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
- #elif defined(GGML_USE_VULKAN)
- #include "ggml-vulkan.h"
#endif

// floating point type used to accumulate sums

@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
- #elif defined(GGML_USE_VULKAN)
- ggml_vk_init_cpu_assist();
#endif

ggml_setup_op_has_task_pass();

@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}

- #if defined(GGML_USE_VULKAN)
- const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
- #ifdef GGML_VULKAN_CHECK_RESULTS
- if (skip_cpu) {
- ggml_vk_check_results_1_cpu_assist(params, tensor);
- }
- #endif
- if (skip_cpu) {
- return;
- }
- GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
- GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
- #endif // GGML_USE_VULKAN
-
switch (tensor->op) {
case GGML_OP_DUP:
{

@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
}
}

- #ifdef GGML_USE_VULKAN
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
- }
- ggml_vk_preallocate_buffers_cpu_assist();
-
- for (int i = 0; i < cgraph->n_nodes; i++) {
- ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
- }
- #endif
-
const int n_threads = cplan->n_threads;

struct ggml_compute_state_shared state_shared = {

@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
}
}

- #ifdef GGML_USE_VULKAN
- ggml_vk_graph_cleanup_cpu_assist();
- #endif
-
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;