Commit 2bfeba3
Parent(s): 645c367
Vulkan Embedding Fix (llama/7360)
* Fix empty Vulkan host buffers
  Add fp32 fp16 matmul shader
  Fix matmul shader alignment
* Remove deprecated tensor->backend uses
* Fix Vulkan validation errors on embedding models with no offloaded layers
* Fix Vulkan llava segfault when not offloading layers
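The common thread of the changes below is that the Vulkan backend stops relying on the deprecated tensor->backend field and the old ctx->disable flag, and instead skips graph nodes that produce no GPU work. The following is a minimal, self-contained C++ sketch of that node-skipping idea only; the node/op types and is_noop() here are illustrative stand-ins and not ggml's real structures (the actual helper added by this commit is ggml_vk_is_empty() in the diff that follows).

#include <cstdio>
#include <vector>

// Illustrative stand-ins only; ggml's real node type is ggml_tensor and the
// real predicate added by this commit is ggml_vk_is_empty().
enum class op { none, reshape, view, permute, transpose, mul_mat };

struct node {
    op   kind;
    long n_elements;   // 0 models an empty tensor
};

// A node is "empty" for the GPU if it has no data or is a pure layout op.
static bool is_noop(const node & n) {
    return n.n_elements == 0 || n.kind == op::none || n.kind == op::reshape ||
           n.kind == op::view || n.kind == op::permute || n.kind == op::transpose;
}

int main() {
    std::vector<node> graph = { {op::mul_mat, 1024}, {op::reshape, 1024}, {op::view, 1024} };

    // Walk backwards past trailing no-ops so the command buffer gets closed on
    // the last node that actually does GPU work (mirrors graph_compute below).
    int last_node = (int) graph.size() - 1;
    while (last_node > 0 && is_noop(graph[last_node])) {
        last_node -= 1;
    }
    std::printf("last real node index: %d\n", last_node);
    return 0;
}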
ggml-vulkan.cpp  CHANGED  +72 -112

@@ -114,6 +114,7 @@ struct vk_device {
     size_t idx;

     vk_matmul_pipeline pipeline_matmul_f32;
+    vk_matmul_pipeline pipeline_matmul_f32_f16;
     vk_matmul_pipeline pipeline_matmul_f16;
     vk_matmul_pipeline pipeline_matmul_f16_f32;
     vk_pipeline pipeline_matmul_split_k_reduce;
@@ -375,13 +376,12 @@ struct ggml_backend_vk_context {
     vk_context * compute_ctx;
     vk_context * transfer_ctx;

-    bool disable;
     bool initialized;

     size_t idx;
 };

-struct
+struct vk_instance_t {
     vk::Instance instance;

     std::vector<size_t> device_indices;
@@ -423,7 +423,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
 typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);

 static bool vk_instance_initialized = false;
-static
+static vk_instance_t vk_instance;

 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);

@@ -1013,6 +1013,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     uint32_t s_align = 32;

     ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
+    ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
     ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1049,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1238,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);

+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
+
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
         ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1859,7 +1874,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
     ctx->compute_ctx = nullptr;
     ctx->transfer_ctx = nullptr;

-    ctx->disable = false;
     ctx->initialized = true;

     ctx->idx = idx;
@@ -1903,6 +1917,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
     if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f32;
     }
+    if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+        return ctx->device->pipeline_matmul_f32_f16;
+    }
     if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
         return ctx->device->pipeline_matmul_f16_f32;
     }
@@ -2722,7 +2739,7 @@ static void ggml_vk_matmul(
         uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
         uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
+    std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
 #endif
     ggml_vk_sync_buffers(subctx);
     if (split_k == 1) {
@@ -2792,7 +2809,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_

 static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ",
+    std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
 #endif
     const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2829,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context

 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +2999,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
         0, 0, 0, 0, 1
     ); // NOLINT
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) ((char *) dst->data);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
-    }
 }

 static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
     GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3158,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context

 static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
     GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3227,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
 }

 static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
-    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
+    std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
 #endif
     GGML_ASSERT(!ggml_is_transposed(src0));
     GGML_ASSERT(!ggml_is_transposed(src1));
     GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

@@ -3302,26 +3304,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        // copy dst to host
-        float * d = (float *) dst->data;
-        ggml_vk_sync_buffers(subctx);
-        ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
-    }
-}
-
-static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
-    const uint64_t ne10 = src1->ne[0];
-
-    const uint64_t ne0 = dst->ne[0];
-    const uint64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-        (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
-        dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
 }

 static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3693,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
     // TODO: support for transposed / permuted tensors
     GGML_ASSERT(nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3902,11 +3882,11 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
 template<typename PC>
 static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ",
+    std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
     if (src1 != nullptr) {
-        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ",
+        std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
     }
-    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ",
+    std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
 #endif
     GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
     GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3923,8 +3903,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
     const uint64_t ne1 = ne10 * ne11;
     // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
-    const uint64_t nb2 = dst->nb[2];
-    const uint64_t nb3 = dst->nb[3];

     vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
     ggml_vk_func_t op_func;
@@ -3975,7 +3953,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     vk_buffer d_D = extra->buffer_gpu.lock();

     // Workaround for tiny tensor inputs on ROPE
-    if (use_src1 &&
+    if (use_src1 && y_sz > d_D->size) {
         y_sz = VK_WHOLE_SIZE;
     }

@@ -4066,13 +4044,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
-        if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
-            ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
-        } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
-            // copy dst to host
-            float * d = (float *) dst->data;
-            ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
-        }
     } else {
         GGML_ASSERT(op != GGML_OP_SOFT_MAX);
         GGML_ASSERT(op != GGML_OP_ARGSORT);
@@ -4111,10 +4082,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_sync_buffers(subctx);
             ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
             }
-            if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                // copy dst to host
-                ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
-            }
         }
     }
 }
@@ -4377,6 +4344,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_s;
         shname = "F32_ALIGNED_S";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_s;
+        shname = "F32_F16_ALIGNED_S";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_s;
         shname = "F16_F32_ALIGNED_S";
@@ -4390,6 +4360,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_m;
         shname = "F32_ALIGNED_M";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_m;
+        shname = "F32_F16_ALIGNED_M";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_m;
         shname = "F16_F32_ALIGNED_M";
@@ -4403,6 +4376,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->a_l;
         shname = "F32_ALIGNED_L";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->a_l;
+        shname = "F32_F16_ALIGNED_L";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->a_l;
         shname = "F16_F32_ALIGNED_L";
@@ -4423,6 +4399,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->s;
         shname = "F32_S";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->s;
+        shname = "F32_F16_S";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->s;
         shname = "F16_F32_S";
@@ -4434,6 +4413,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->m;
         shname = "F32_M";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->m;
+        shname = "F32_F16_M";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->m;
         shname = "F16_F32_M";
@@ -4445,6 +4427,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f32->l;
         shname = "F32_L";
+    } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
+        p = ctx->device->pipeline_matmul_f32_f16->l;
+        shname = "F32_F16_L";
     } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
         p = ctx->device->pipeline_matmul_f16_f32->l;
         shname = "F16_F32_L";
@@ -4557,15 +4542,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;

-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);

     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

-    ctx->disable = false;
-
     ggml_free(ggml_ctx);

     double avg_err = 0.0;
@@ -5045,15 +5026,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
     src1_ggml->data = y;
     tensor_ggml->data = d_chk;

-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_ggml);

     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);

-    ctx->disable = false;
-
     ggml_free(ggml_ctx);

     double avg_err = 0.0;
@@ -5130,12 +5107,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
 #endif
-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (extra == nullptr) {
         return;
     }

-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     ggml_tensor * src0 = node->src[0];
     ggml_tensor * src1 = node->src[1];

@@ -5240,9 +5217,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
 }

 static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
@@ -5416,7 +5390,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
 }

 static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
-
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
+
+    if (ggml_is_empty(node) || extra == nullptr) {
         return;
     }

@@ -5429,8 +5405,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     const ggml_tensor * src0 = node->src[0];
     const ggml_tensor * src1 = node->src[1];

-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
-
     switch (node->op) {
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -5575,7 +5549,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif

-    if (
+    if (last_node) {
         ggml_vk_ctx_end(ctx->compute_ctx);
         ctx->compute_ctx->exit_tensor = node;
         ctx->compute_ctx = nullptr;
@@ -5583,10 +5557,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 }

 static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
-    if (ctx->disable) {
-        return false;
-    }
-
     ggml_tensor_extra_gpu * extra = nullptr;

     switch (tensor->op) {
@@ -5645,7 +5615,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     }

 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ",
+    std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
 #endif

 #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5685,9 +5655,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_

 // Clean up after graph processing is done
 static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
-    if (ctx->disable) {
-        return;
-    }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
 #endif
@@ -5860,7 +5827,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
         extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
     }

-    tensor->backend = GGML_BACKEND_TYPE_GPU;
     tensor->extra = extra;
 }

@@ -5868,8 +5834,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5883,8 +5847,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
 #endif
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
-
     ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6027,6 +5989,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
 #endif
+    size += 32; // Behave like the CPU buffer type
     void * ptr = nullptr;
     try {
         ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
@@ -6114,7 +6077,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -6135,7 +6097,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 #endif
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
     GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);

     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;

@@ -6201,6 +6162,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     ctx->transfer_ctx = nullptr;
 }

+static bool ggml_vk_is_empty(ggml_tensor * node) {
+    return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+}
+
 GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6215,7 +6180,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     int last_node = cgraph->n_nodes - 1;

     // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
-    while (last_node > 0 && (cgraph->nodes[last_node]
+    while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
         last_node -= 1;
     }

@@ -6229,7 +6194,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

-        if (
+        if (ggml_vk_is_empty(node)) {
             continue;
         }

@@ -6873,16 +6838,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
         GGML_ASSERT(false);
     }

-    // Disable vulkan here to avoid the hooks in ggml.c
-    ctx->disable = true;
-
     ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
     ggml_build_forward_expand(cgraph, tensor_clone);

     ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);

-    ctx->disable = false;
-
     ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
     if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
         ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");