OccamRazor committed on
Commit 2bfeba3 · 1 Parent(s): 645c367

Vulkan Embedding Fix (llama/7360)


* Fix empty Vulkan host buffers

Add fp32 fp16 matmul shader

Fix matmul shader alignment

* Remove deprecated tensor->backend uses (the replacement check is sketched below)

* Fix Vulkan validation errors on embedding models with no offloaded layers

* Fix Vulkan llava segfault when not offloading layers
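
A minimal, self-contained sketch of the node-skipping pattern this diff moves to: instead of consulting the removed tensor->backend field, the Vulkan backend now filters no-op graph nodes (the new ggml_vk_is_empty helper) and returns early when a node has no GPU extra. The sketch_* types and functions below are illustrative stand-ins, not ggml's real structs; only the logic mirrors the hunks in ggml-vulkan.cpp.

```cpp
// Stand-in types: sketch_op / sketch_tensor are NOT ggml's structs, they only
// model the two fields the new checks look at (the op and the GPU "extra").
enum sketch_op { OP_NONE, OP_RESHAPE, OP_VIEW, OP_PERMUTE, OP_TRANSPOSE, OP_MUL_MAT };

struct sketch_tensor {
    sketch_op op    = OP_NONE;
    void *    extra = nullptr;   // stands in for ggml_tensor_extra_gpu *
    bool      empty = false;     // stands in for ggml_is_empty(node)
};

// Mirrors the ggml_vk_is_empty() helper added in this commit: no-op graph nodes
// (NONE, reshape, view, permute, transpose, empty tensors) are filtered out.
static bool sketch_is_noop(const sketch_tensor * node) {
    return node->empty || node->op == OP_NONE || node->op == OP_RESHAPE ||
           node->op == OP_TRANSPOSE || node->op == OP_VIEW || node->op == OP_PERMUTE;
}

// Mirrors the new early-returns in ggml_vk_preallocate_buffers_graph() /
// ggml_vk_build_graph(): a node with no GPU extra was never placed in a Vulkan
// buffer (e.g. nothing was offloaded), so the backend has nothing to record for it.
static bool sketch_should_build(const sketch_tensor * node) {
    return !sketch_is_noop(node) && node->extra != nullptr;
}

int main() {
    sketch_tensor host_only;             // e.g. an embedding output left on the CPU
    sketch_tensor gpu_node;              // a node backed by a Vulkan buffer
    int dummy = 0;
    gpu_node.op    = OP_MUL_MAT;
    gpu_node.extra = &dummy;

    // Expect: host-only node skipped, GPU-backed node built.
    return (!sketch_should_build(&host_only) && sketch_should_build(&gpu_node)) ? 0 : 1;
}
```

Running the sketch returns 0 when the host-only node is skipped and the GPU-backed node is kept, which is roughly the behaviour the embedding and llava fixes in the hunks below rely on.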

Files changed (1)
  1. ggml-vulkan.cpp +72 -112
ggml-vulkan.cpp CHANGED
@@ -114,6 +114,7 @@ struct vk_device {
114
  size_t idx;
115
 
116
  vk_matmul_pipeline pipeline_matmul_f32;
 
117
  vk_matmul_pipeline pipeline_matmul_f16;
118
  vk_matmul_pipeline pipeline_matmul_f16_f32;
119
  vk_pipeline pipeline_matmul_split_k_reduce;
@@ -375,13 +376,12 @@ struct ggml_backend_vk_context {
375
  vk_context * compute_ctx;
376
  vk_context * transfer_ctx;
377
 
378
- bool disable;
379
  bool initialized;
380
 
381
  size_t idx;
382
  };
383
 
384
- struct vk_instance {
385
  vk::Instance instance;
386
 
387
  std::vector<size_t> device_indices;
@@ -423,7 +423,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
423
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
424
 
425
  static bool vk_instance_initialized = false;
426
- static vk_instance vk_instance;
427
 
428
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
429
 
@@ -1013,6 +1013,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1013
  uint32_t s_align = 32;
1014
 
1015
  ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
 
1016
  ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1017
  ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
1018
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
@@ -1048,6 +1049,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1048
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1049
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1050
 
1051
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1052
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1053
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1230,6 +1238,13 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1230
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1231
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1232
 
1233
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1234
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1235
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
@@ -1859,7 +1874,6 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
1859
  ctx->compute_ctx = nullptr;
1860
  ctx->transfer_ctx = nullptr;
1861
 
1862
- ctx->disable = false;
1863
  ctx->initialized = true;
1864
 
1865
  ctx->idx = idx;
@@ -1903,6 +1917,9 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte
1903
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
1904
  return ctx->device->pipeline_matmul_f32;
1905
  }
1906
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
1907
  return ctx->device->pipeline_matmul_f16_f32;
1908
  }
@@ -2722,7 +2739,7 @@ static void ggml_vk_matmul(
2722
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2723
  uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
2724
  #ifdef GGML_VULKAN_DEBUG
2725
- std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
2726
  #endif
2727
  ggml_vk_sync_buffers(subctx);
2728
  if (split_k == 1) {
@@ -2792,7 +2809,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
2792
 
2793
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2794
  #ifdef GGML_VULKAN_DEBUG
2795
- std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2796
  std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2797
  #endif
2798
  const int tensor_type_size = ggml_type_size(tensor->type);
@@ -2812,9 +2829,9 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
2812
 
2813
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2814
  #ifdef GGML_VULKAN_DEBUG
2815
- std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2816
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2817
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2818
  #endif
2819
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2820
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -2982,19 +2999,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
2982
  ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
2983
  0, 0, 0, 0, 1
2984
  ); // NOLINT
2985
-
2986
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
2987
- // copy dst to host
2988
- float * d = (float *) ((char *) dst->data);
2989
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13);
2990
- }
2991
  }
2992
 
2993
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2994
  #ifdef GGML_VULKAN_DEBUG
2995
- std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2996
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2997
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2998
  #endif
2999
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3000
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
@@ -3147,12 +3158,11 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
3147
 
3148
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3149
  #ifdef GGML_VULKAN_DEBUG
3150
- std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3151
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3152
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3153
  #endif
3154
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
3155
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
3156
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3157
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
3158
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
@@ -3217,25 +3227,17 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
3217
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3218
  ggml_vk_sync_buffers(subctx);
3219
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3220
-
3221
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3222
- // copy dst to host
3223
- float * d = (float *) dst->data;
3224
- ggml_vk_sync_buffers(subctx);
3225
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
3226
- }
3227
  }
3228
 
3229
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3230
  #ifdef GGML_VULKAN_DEBUG
3231
- std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3232
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3233
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3234
  #endif
3235
  GGML_ASSERT(!ggml_is_transposed(src0));
3236
  GGML_ASSERT(!ggml_is_transposed(src1));
3237
  GGML_ASSERT(!ggml_is_permuted(src0));
3238
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
3239
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
3240
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
3241
 
@@ -3302,26 +3304,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
3302
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3303
  ggml_vk_sync_buffers(subctx);
3304
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3305
-
3306
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
3307
- // copy dst to host
3308
- float * d = (float *) dst->data;
3309
- ggml_vk_sync_buffers(subctx);
3310
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset, d, sizeof(float) * d_ne);
3311
- }
3312
- }
3313
-
3314
- static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) {
3315
- const uint64_t ne10 = src1->ne[0];
3316
-
3317
- const uint64_t ne0 = dst->ne[0];
3318
- const uint64_t ne1 = dst->ne[1];
3319
-
3320
- // TODO: find the optimal values for these
3321
- return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
3322
- (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) &&
3323
- dst->type == GGML_TYPE_F32 &&
3324
- ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU);
3325
  }
3326
 
3327
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -3711,8 +3693,6 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
3711
  // TODO: support for transposed / permuted tensors
3712
  GGML_ASSERT(nb0 == sizeof(float));
3713
  GGML_ASSERT(nb00 == sizeof(float));
3714
- GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
3715
- GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
3716
 
3717
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3718
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
@@ -3902,11 +3882,11 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
3902
  template<typename PC>
3903
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3904
  #ifdef GGML_VULKAN_DEBUG
3905
- std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3906
  if (src1 != nullptr) {
3907
- std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3908
  }
3909
- std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3910
  #endif
3911
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
3912
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
@@ -3923,8 +3903,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3923
  const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
3924
  const uint64_t ne1 = ne10 * ne11;
3925
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
3926
- const uint64_t nb2 = dst->nb[2];
3927
- const uint64_t nb3 = dst->nb[3];
3928
 
3929
  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
3930
  ggml_vk_func_t op_func;
@@ -3975,7 +3953,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3975
  vk_buffer d_D = extra->buffer_gpu.lock();
3976
 
3977
  // Workaround for tiny tensor inputs on ROPE
3978
- if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) {
3979
  y_sz = VK_WHOLE_SIZE;
3980
  }
3981
 
@@ -4066,13 +4044,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4066
  ggml_vk_sync_buffers(subctx);
4067
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4068
  }
4069
- if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) {
4070
- ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
4071
- } else if(dst->backend == GGML_BACKEND_TYPE_CPU) {
4072
- // copy dst to host
4073
- float * d = (float *) dst->data;
4074
- ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
4075
- }
4076
  } else {
4077
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4078
  GGML_ASSERT(op != GGML_OP_ARGSORT);
@@ -4111,10 +4082,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4111
  ggml_vk_sync_buffers(subctx);
4112
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
4113
  }
4114
- if (dst->backend == GGML_BACKEND_TYPE_CPU) {
4115
- // copy dst to host
4116
- ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz);
4117
- }
4118
  }
4119
  }
4120
  }
@@ -4377,6 +4344,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4377
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4378
  p = ctx->device->pipeline_matmul_f32->a_s;
4379
  shname = "F32_ALIGNED_S";
4380
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4381
  p = ctx->device->pipeline_matmul_f16_f32->a_s;
4382
  shname = "F16_F32_ALIGNED_S";
@@ -4390,6 +4360,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4390
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4391
  p = ctx->device->pipeline_matmul_f32->a_m;
4392
  shname = "F32_ALIGNED_M";
4393
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4394
  p = ctx->device->pipeline_matmul_f16_f32->a_m;
4395
  shname = "F16_F32_ALIGNED_M";
@@ -4403,6 +4376,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4403
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4404
  p = ctx->device->pipeline_matmul_f32->a_l;
4405
  shname = "F32_ALIGNED_L";
4406
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4407
  p = ctx->device->pipeline_matmul_f16_f32->a_l;
4408
  shname = "F16_F32_ALIGNED_L";
@@ -4423,6 +4399,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4423
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4424
  p = ctx->device->pipeline_matmul_f32->s;
4425
  shname = "F32_S";
4426
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4427
  p = ctx->device->pipeline_matmul_f16_f32->s;
4428
  shname = "F16_F32_S";
@@ -4434,6 +4413,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4434
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4435
  p = ctx->device->pipeline_matmul_f32->m;
4436
  shname = "F32_M";
4437
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4438
  p = ctx->device->pipeline_matmul_f16_f32->m;
4439
  shname = "F16_F32_M";
@@ -4445,6 +4427,9 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4445
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4446
  p = ctx->device->pipeline_matmul_f32->l;
4447
  shname = "F32_L";
4448
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4449
  p = ctx->device->pipeline_matmul_f16_f32->l;
4450
  shname = "F16_F32_L";
@@ -4557,15 +4542,11 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
4557
  src1_ggml->data = y;
4558
  tensor_ggml->data = d_chk;
4559
 
4560
- ctx->disable = true;
4561
-
4562
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
4563
  ggml_build_forward_expand(cgraph, tensor_ggml);
4564
 
4565
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
4566
 
4567
- ctx->disable = false;
4568
-
4569
  ggml_free(ggml_ctx);
4570
 
4571
  double avg_err = 0.0;
@@ -5045,15 +5026,11 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m,
5045
  src1_ggml->data = y;
5046
  tensor_ggml->data = d_chk;
5047
 
5048
- ctx->disable = true;
5049
-
5050
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
5051
  ggml_build_forward_expand(cgraph, tensor_ggml);
5052
 
5053
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
5054
 
5055
- ctx->disable = false;
5056
-
5057
  ggml_free(ggml_ctx);
5058
 
5059
  double avg_err = 0.0;
@@ -5130,12 +5107,12 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5130
  #ifdef GGML_VULKAN_DEBUG
5131
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
5132
  #endif
5133
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) {
5134
  return;
5135
  }
5136
 
5137
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5138
-
5139
  ggml_tensor * src0 = node->src[0];
5140
  ggml_tensor * src1 = node->src[1];
5141
 
@@ -5240,9 +5217,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
5240
  }
5241
 
5242
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5243
- if (ctx->disable) {
5244
- return;
5245
- }
5246
  #ifdef GGML_VULKAN_DEBUG
5247
  std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
5248
  #endif
@@ -5416,7 +5390,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5416
  }
5417
 
5418
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
5419
- if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(node)) {
5420
  return;
5421
  }
5422
 
@@ -5429,8 +5405,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5429
  const ggml_tensor * src0 = node->src[0];
5430
  const ggml_tensor * src1 = node->src[1];
5431
 
5432
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5433
-
5434
  switch (node->op) {
5435
  case GGML_OP_UNARY:
5436
  switch (ggml_get_unary_op(node)) {
@@ -5575,7 +5549,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5575
  last_node = true;
5576
  #endif
5577
 
5578
- if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) {
5579
  ggml_vk_ctx_end(ctx->compute_ctx);
5580
  ctx->compute_ctx->exit_tensor = node;
5581
  ctx->compute_ctx = nullptr;
@@ -5583,10 +5557,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5583
  }
5584
 
5585
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
5586
- if (ctx->disable) {
5587
- return false;
5588
- }
5589
-
5590
  ggml_tensor_extra_gpu * extra = nullptr;
5591
 
5592
  switch (tensor->op) {
@@ -5645,7 +5615,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5645
  }
5646
 
5647
  #ifdef GGML_VULKAN_DEBUG
5648
- std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5649
  #endif
5650
 
5651
  #ifdef GGML_VULKAN_CHECK_RESULTS
@@ -5685,9 +5655,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
5685
 
5686
  // Clean up after graph processing is done
5687
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5688
- if (ctx->disable) {
5689
- return;
5690
- }
5691
  #ifdef GGML_VULKAN_DEBUG
5692
  std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
5693
  #endif
@@ -5860,7 +5827,6 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b
5860
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
5861
  }
5862
 
5863
- tensor->backend = GGML_BACKEND_TYPE_GPU;
5864
  tensor->extra = extra;
5865
  }
5866
 
@@ -5868,8 +5834,6 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu
5868
  #ifdef GGML_VULKAN_DEBUG
5869
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5870
  #endif
5871
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5872
-
5873
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5874
 
5875
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -5883,8 +5847,6 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu
5883
  #ifdef GGML_VULKAN_DEBUG
5884
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5885
  #endif
5886
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
5887
-
5888
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5889
 
5890
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
@@ -6027,6 +5989,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
6027
  #ifdef GGML_VULKAN_DEBUG
6028
  std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6029
  #endif
 
6030
  void * ptr = nullptr;
6031
  try {
6032
  ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
@@ -6114,7 +6077,6 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
6114
  #endif
6115
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6116
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6117
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6118
 
6119
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6120
 
@@ -6135,7 +6097,6 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
6135
  #endif
6136
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6137
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
6138
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU);
6139
 
6140
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6141
 
@@ -6201,6 +6162,10 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
6201
  ctx->transfer_ctx = nullptr;
6202
  }
6203
 
6204
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6205
  #ifdef GGML_VULKAN_DEBUG
6206
  std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
@@ -6215,7 +6180,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6215
  int last_node = cgraph->n_nodes - 1;
6216
 
6217
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
6218
- while (last_node > 0 && (cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(cgraph->nodes[last_node]))) {
6219
  last_node -= 1;
6220
  }
6221
 
@@ -6229,7 +6194,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
6229
  for (int i = 0; i < cgraph->n_nodes; i++) {
6230
  ggml_tensor * node = cgraph->nodes[i];
6231
 
6232
- if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
6233
  continue;
6234
  }
6235
 
@@ -6873,16 +6838,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6873
  GGML_ASSERT(false);
6874
  }
6875
 
6876
- // Disable vulkan here to avoid the hooks in ggml.c
6877
- ctx->disable = true;
6878
-
6879
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
6880
  ggml_build_forward_expand(cgraph, tensor_clone);
6881
 
6882
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6883
 
6884
- ctx->disable = false;
6885
-
6886
  ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6887
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6888
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
 
114
  size_t idx;
115
 
116
  vk_matmul_pipeline pipeline_matmul_f32;
117
+ vk_matmul_pipeline pipeline_matmul_f32_f16;
118
  vk_matmul_pipeline pipeline_matmul_f16;
119
  vk_matmul_pipeline pipeline_matmul_f16_f32;
120
  vk_pipeline pipeline_matmul_split_k_reduce;
 
376
  vk_context * compute_ctx;
377
  vk_context * transfer_ctx;
378
 
 
379
  bool initialized;
380
 
381
  size_t idx;
382
  };
383
 
384
+ struct vk_instance_t {
385
  vk::Instance instance;
386
 
387
  std::vector<size_t> device_indices;
 
423
  typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
424
 
425
  static bool vk_instance_initialized = false;
426
+ static vk_instance_t vk_instance;
427
 
428
  GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
429
 
 
1013
  uint32_t s_align = 32;
1014
 
1015
  ctx->device->pipeline_matmul_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1016
+ ctx->device->pipeline_matmul_f32_f16 = std::make_shared<vk_matmul_pipeline_struct>();
1017
  ctx->device->pipeline_matmul_f16_f32 = std::make_shared<vk_matmul_pipeline_struct>();
1018
  ctx->device->pipeline_matmul_f16 = std::make_shared<vk_matmul_pipeline_struct>();
1019
  ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0] = std::make_shared<vk_matmul_pipeline_struct>();
 
1049
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1050
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1051
 
1052
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1053
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1054
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_len, matmul_f32_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1055
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1056
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1057
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_len, matmul_f32_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1058
+
1059
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1060
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1061
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
 
1238
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1239
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1240
 
1241
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->l, "matmul_f32_f16_l", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1242
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->m, "matmul_f32_f16_m", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1243
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->s, "matmul_f32_f16_s", matmul_f32_f16_fp32_len, matmul_f32_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
1244
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_l, "matmul_f32_f16_aligned_l", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align);
1245
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_m, "matmul_f32_f16_aligned_m", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align);
1246
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_f16->a_s, "matmul_f32_f16_aligned_s", matmul_f32_f16_aligned_fp32_len, matmul_f32_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align);
1247
+
1248
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
1249
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
1250
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1);
 
1874
  ctx->compute_ctx = nullptr;
1875
  ctx->transfer_ctx = nullptr;
1876
 
 
1877
  ctx->initialized = true;
1878
 
1879
  ctx->idx = idx;
 
1917
  if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
1918
  return ctx->device->pipeline_matmul_f32;
1919
  }
1920
+ if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
1921
+ return ctx->device->pipeline_matmul_f32_f16;
1922
+ }
1923
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) {
1924
  return ctx->device->pipeline_matmul_f16_f32;
1925
  }
 
2739
  uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d,
2740
  uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) {
2741
  #ifdef GGML_VULKAN_DEBUG
2742
+ std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl;
2743
  #endif
2744
  ggml_vk_sync_buffers(subctx);
2745
  if (split_k == 1) {
 
2809
 
2810
  static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
2811
  #ifdef GGML_VULKAN_DEBUG
2812
+ std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
2813
  std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
2814
  #endif
2815
  const int tensor_type_size = ggml_type_size(tensor->type);
 
2829
 
2830
  static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
2831
  #ifdef GGML_VULKAN_DEBUG
2832
+ std::cerr << "ggml_vk_mul_mat_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
2833
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
2834
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
2835
  #endif
2836
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
2837
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
2999
  ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21,
3000
  0, 0, 0, 0, 1
3001
  ); // NOLINT
3002
  }
3003
 
3004
  static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3005
  #ifdef GGML_VULKAN_DEBUG
3006
+ std::cerr << "ggml_vk_mul_mat_vec_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3007
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3008
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3009
  #endif
3010
  GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT
3011
  GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT
 
3158
 
3159
  static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3160
  #ifdef GGML_VULKAN_DEBUG
3161
+ std::cerr << "ggml_vk_mul_mat_p021_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3162
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3163
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3164
  #endif
3165
  GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
 
3166
  GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT
3167
  GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT
3168
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
3227
  const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3228
  ggml_vk_sync_buffers(subctx);
3229
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3230
  }
3231
 
3232
  static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
3233
  #ifdef GGML_VULKAN_DEBUG
3234
+ std::cerr << "ggml_vk_mul_mat_nc_f16_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3235
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3236
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl;
3237
  #endif
3238
  GGML_ASSERT(!ggml_is_transposed(src0));
3239
  GGML_ASSERT(!ggml_is_transposed(src1));
3240
  GGML_ASSERT(!ggml_is_permuted(src0));
 
3241
  GGML_ASSERT(src0->type == GGML_TYPE_F16);
3242
  GGML_ASSERT(src1->type == GGML_TYPE_F32);
3243
 
 
3304
  const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
3305
  ggml_vk_sync_buffers(subctx);
3306
  ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
3307
  }
3308
 
3309
  static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
3693
  // TODO: support for transposed / permuted tensors
3694
  GGML_ASSERT(nb0 == sizeof(float));
3695
  GGML_ASSERT(nb00 == sizeof(float));
3696
 
3697
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3698
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
 
3882
  template<typename PC>
3883
  static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3884
  #ifdef GGML_VULKAN_DEBUG
3885
+ std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3886
  if (src1 != nullptr) {
3887
+ std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3888
  }
3889
+ std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3890
  #endif
3891
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
3892
  GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT
 
3903
  const uint64_t ne13 = use_src1 ? src1->ne[3] : 0;
3904
  const uint64_t ne1 = ne10 * ne11;
3905
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
3906
 
3907
  vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
3908
  ggml_vk_func_t op_func;
 
3953
  vk_buffer d_D = extra->buffer_gpu.lock();
3954
 
3955
  // Workaround for tiny tensor inputs on ROPE
3956
+ if (use_src1 && y_sz > d_D->size) {
3957
  y_sz = VK_WHOLE_SIZE;
3958
  }
3959
 
 
4044
  ggml_vk_sync_buffers(subctx);
4045
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4046
  }
4047
  } else {
4048
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4049
  GGML_ASSERT(op != GGML_OP_ARGSORT);
 
4082
  ggml_vk_sync_buffers(subctx);
4083
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
4084
  }
4085
  }
4086
  }
4087
  }
 
4344
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4345
  p = ctx->device->pipeline_matmul_f32->a_s;
4346
  shname = "F32_ALIGNED_S";
4347
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4348
+ p = ctx->device->pipeline_matmul_f32_f16->a_s;
4349
+ shname = "F32_F16_ALIGNED_S";
4350
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4351
  p = ctx->device->pipeline_matmul_f16_f32->a_s;
4352
  shname = "F16_F32_ALIGNED_S";
 
4360
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4361
  p = ctx->device->pipeline_matmul_f32->a_m;
4362
  shname = "F32_ALIGNED_M";
4363
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4364
+ p = ctx->device->pipeline_matmul_f32_f16->a_m;
4365
+ shname = "F32_F16_ALIGNED_M";
4366
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4367
  p = ctx->device->pipeline_matmul_f16_f32->a_m;
4368
  shname = "F16_F32_ALIGNED_M";
 
4376
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4377
  p = ctx->device->pipeline_matmul_f32->a_l;
4378
  shname = "F32_ALIGNED_L";
4379
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4380
+ p = ctx->device->pipeline_matmul_f32_f16->a_l;
4381
+ shname = "F32_F16_ALIGNED_L";
4382
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4383
  p = ctx->device->pipeline_matmul_f16_f32->a_l;
4384
  shname = "F16_F32_ALIGNED_L";
 
4399
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4400
  p = ctx->device->pipeline_matmul_f32->s;
4401
  shname = "F32_S";
4402
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4403
+ p = ctx->device->pipeline_matmul_f32_f16->s;
4404
+ shname = "F32_F16_S";
4405
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4406
  p = ctx->device->pipeline_matmul_f16_f32->s;
4407
  shname = "F16_F32_S";
 
4413
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4414
  p = ctx->device->pipeline_matmul_f32->m;
4415
  shname = "F32_M";
4416
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4417
+ p = ctx->device->pipeline_matmul_f32_f16->m;
4418
+ shname = "F32_F16_M";
4419
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4420
  p = ctx->device->pipeline_matmul_f16_f32->m;
4421
  shname = "F16_F32_M";
 
4427
  if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4428
  p = ctx->device->pipeline_matmul_f32->l;
4429
  shname = "F32_L";
4430
+ } else if (std::is_same<float, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
4431
+ p = ctx->device->pipeline_matmul_f32_f16->l;
4432
+ shname = "F32_F16_L";
4433
  } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
4434
  p = ctx->device->pipeline_matmul_f16_f32->l;
4435
  shname = "F16_F32_L";
 
4542
  src1_ggml->data = y;
4543
  tensor_ggml->data = d_chk;
4544
 
4545
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
4546
  ggml_build_forward_expand(cgraph, tensor_ggml);
4547
 
4548
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
4549
 
4550
  ggml_free(ggml_ctx);
4551
 
4552
  double avg_err = 0.0;
 
5026
  src1_ggml->data = y;
5027
  tensor_ggml->data = d_chk;
5028
 
5029
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
5030
  ggml_build_forward_expand(cgraph, tensor_ggml);
5031
 
5032
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 1);
5033
 
5034
  ggml_free(ggml_ctx);
5035
 
5036
  double avg_err = 0.0;
 
5107
  #ifdef GGML_VULKAN_DEBUG
5108
  std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl;
5109
  #endif
5110
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5111
+
5112
+ if (extra == nullptr) {
5113
  return;
5114
  }
5115
 
5116
  ggml_tensor * src0 = node->src[0];
5117
  ggml_tensor * src1 = node->src[1];
5118
 
 
5217
  }
5218
 
5219
  static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
5220
  #ifdef GGML_VULKAN_DEBUG
5221
  std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
5222
  #endif
 
5390
  }
5391
 
5392
  static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){
5393
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
5394
+
5395
+ if (ggml_is_empty(node) || extra == nullptr) {
5396
  return;
5397
  }
5398
 
 
5405
  const ggml_tensor * src0 = node->src[0];
5406
  const ggml_tensor * src1 = node->src[1];
5407
 
 
 
5408
  switch (node->op) {
5409
  case GGML_OP_UNARY:
5410
  switch (ggml_get_unary_op(node)) {
 
5549
  last_node = true;
5550
  #endif
5551
 
5552
+ if (last_node) {
5553
  ggml_vk_ctx_end(ctx->compute_ctx);
5554
  ctx->compute_ctx->exit_tensor = node;
5555
  ctx->compute_ctx = nullptr;
 
5557
  }
5558
 
5559
  static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){
5560
  ggml_tensor_extra_gpu * extra = nullptr;
5561
 
5562
  switch (tensor->op) {
 
5615
  }
5616
 
5617
  #ifdef GGML_VULKAN_DEBUG
5618
+ std::cerr << "ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")" << std::endl;
5619
  #endif
5620
 
5621
  #ifdef GGML_VULKAN_CHECK_RESULTS
 
5655
 
5656
  // Clean up after graph processing is done
5657
  static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
5658
  #ifdef GGML_VULKAN_DEBUG
5659
  std::cerr << "ggml_vk_graph_cleanup()" << std::endl;
5660
  #endif
 
5827
  extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base;
5828
  }
5829
 
 
5830
  tensor->extra = extra;
5831
  }
5832
 
 
5834
  #ifdef GGML_VULKAN_DEBUG
5835
  std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5836
  #endif
5837
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5838
 
5839
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
5847
  #ifdef GGML_VULKAN_DEBUG
5848
  std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl;
5849
  #endif
5850
  ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
5851
 
5852
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
5989
  #ifdef GGML_VULKAN_DEBUG
5990
  std::cerr << "ggml_backend_vk_host_buffer_type_alloc_buffer(" << size << ")" << std::endl;
5991
  #endif
5992
+ size += 32; // Behave like the CPU buffer type
5993
  void * ptr = nullptr;
5994
  try {
5995
  ptr = ggml_vk_host_malloc(&vk_instance.contexts[0], size);
 
6077
  #endif
6078
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6079
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
6080
 
6081
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6082
 
 
6097
  #endif
6098
  ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
6099
  GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type");
 
6100
 
6101
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
6102
 
 
6162
  ctx->transfer_ctx = nullptr;
6163
  }
6164
 
6165
+ static bool ggml_vk_is_empty(ggml_tensor * node) {
6166
+ return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
6167
+ }
6168
+
6169
  GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
6170
  #ifdef GGML_VULKAN_DEBUG
6171
  std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl;
 
6180
  int last_node = cgraph->n_nodes - 1;
6181
 
6182
  // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
6183
+ while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
6184
  last_node -= 1;
6185
  }
6186
 
 
6194
  for (int i = 0; i < cgraph->n_nodes; i++) {
6195
  ggml_tensor * node = cgraph->nodes[i];
6196
 
6197
+ if (ggml_vk_is_empty(node)) {
6198
  continue;
6199
  }
6200
 
 
6838
  GGML_ASSERT(false);
6839
  }
6840
 
6841
  ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
6842
  ggml_build_forward_expand(cgraph, tensor_clone);
6843
 
6844
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6845
 
6846
  ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6847
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6848
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");