OccamRazor committed
Commit be0ec58 · 1 Parent(s): b38d0f9

Update vulkan rope implementation to support frequency factors (llama/7475)
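This patch threads the optional frequency-factors tensor (`dst->src[2]`, used by Phi-3-style long-context RoPE and referenced in the TODO removed below, llama.cpp PR #7225) through the Vulkan backend: the NeoX rope pipelines gain a fourth descriptor binding and a `has_freq_facs` push constant, and `ggml_vk_op_f32` learns to bind a third source buffer. For context, a frequency factor simply divides the per-dimension rotation frequency before the angle is computed. The following is a minimal CPU-side sketch of that math only, not the shader code; the function and parameter names are illustrative.

```cpp
#include <cmath>

// Minimal sketch: NeoX-style RoPE applied to one row of length n_dims,
// with an optional per-dimension frequency factor. Purely illustrative;
// rope_neox_row_sketch and its parameters are hypothetical names.
static void rope_neox_row_sketch(float * x, int n_dims, int pos,
                                 float freq_base, float freq_scale,
                                 const float * freq_factors /* may be nullptr */) {
    const float theta_scale = std::pow(freq_base, -2.0f / n_dims);
    float theta = (float) pos;
    for (int i = 0; i < n_dims / 2; ++i) {
        // A factor > 1 lengthens the wavelength of this dimension,
        // which is how the long-context scaling is expressed.
        const float ff    = freq_factors ? freq_factors[i] : 1.0f;
        const float angle = freq_scale * theta / ff;
        const float c = std::cos(angle);
        const float s = std::sin(angle);
        const float x0 = x[i];
        const float x1 = x[i + n_dims / 2];
        x[i]              = x0 * c - x1 * s;
        x[i + n_dims / 2] = x0 * s + x1 * c;
        theta *= theta_scale; // advance to the next (lower) frequency
    }
}
```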

Files changed (1)
  1. ggml-vulkan.cpp +168 -91
ggml-vulkan.cpp CHANGED
@@ -290,6 +290,7 @@ struct vk_op_rope_neox_push_constants {
290
  float corr_dims[4];
291
  float theta_scale;
292
  float inv_ndims;
 
293
  };
294
 
295
  struct vk_op_soft_max_push_constants {
@@ -1522,8 +1523,8 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1522
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1523
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1524
 
1525
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1526
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1527
 
1528
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1529
  }
@@ -3732,7 +3733,7 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
3732
  }
3733
 
3734
 
3735
- static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
3736
  switch (op) {
3737
  case GGML_OP_ADD:
3738
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
@@ -3853,6 +3854,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3853
  default:
3854
  return nullptr;
3855
  }
3856
  }
3857
 
3858
  static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
@@ -3880,12 +3883,15 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
3880
  }
3881
 
3882
  template<typename PC>
3883
- static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3884
  #ifdef GGML_VULKAN_DEBUG
3885
  std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3886
  if (src1 != nullptr) {
3887
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3888
  }
3889
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3890
  #endif
3891
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
@@ -3896,6 +3902,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3896
  const uint64_t ne02 = src0->ne[2];
3897
  const uint64_t ne03 = src0->ne[3];
3898
  const uint64_t ne0 = ne00 * ne01;
 
3899
  const bool use_src1 = src1 != nullptr;
3900
  const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
3901
  const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
@@ -3904,7 +3911,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3904
  const uint64_t ne1 = ne10 * ne11;
3905
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
3906
 
3907
- vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
3908
  ggml_vk_func_t op_func;
3909
 
3910
  if (pipeline == nullptr) {
@@ -3927,15 +3941,18 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3927
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3928
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3929
  ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
 
3930
 
3931
  vk_buffer d_X = nullptr;
3932
  size_t x_buf_offset = 0;
3933
  vk_buffer d_Y = nullptr;
3934
  size_t y_buf_offset = 0;
3935
  vk_buffer d_Z = nullptr;
 
3936
 
3937
  bool src0_uma = false;
3938
  bool src1_uma = false;
 
3939
 
3940
  if (ctx->device->uma) {
3941
  ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
@@ -3944,10 +3961,15 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3944
  ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
3945
  src1_uma = d_Y != nullptr;
3946
  }
3947
  }
3948
 
3949
  uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3950
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
 
3951
  uint64_t d_sz = ggml_type_size(dst->type) * ne0;
3952
 
3953
  vk_buffer d_D = extra->buffer_gpu.lock();
@@ -3970,10 +3992,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3970
  y_buf_offset = extra_src1->offset;
3971
  GGML_ASSERT(d_Y != nullptr);
3972
  }
3973
 
3974
  if (op_supports_incontiguous) {
3975
  x_sz = ggml_nbytes(src0);
3976
  y_sz = use_src1 ? ggml_nbytes(src1) : 0;
 
3977
  d_sz = ggml_nbytes(dst);
3978
 
3979
  if (x_buf_offset + x_sz >= d_X->size) {
@@ -3982,6 +4010,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
3982
  if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
3983
  y_sz = VK_WHOLE_SIZE;
3984
  }
3985
  if (d_buf_offset + d_sz >= d_D->size) {
3986
  d_sz = VK_WHOLE_SIZE;
3987
  }
@@ -4021,13 +4052,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4021
  if (use_src1 && y_sz != VK_WHOLE_SIZE) {
4022
  y_sz *= ne12 * ne13;
4023
  }
4024
  if (d_sz != VK_WHOLE_SIZE) {
4025
  d_sz *= ne02 * ne03;
4026
  }
4027
  }
4028
 
4029
  if (op == GGML_OP_SOFT_MAX) {
4030
- // Empty src1 is possible on soft_max, but the shader needs a buffer
4031
  vk_subbuffer subbuf_y;
4032
  if (use_src1) {
4033
  subbuf_y = { d_Y, y_buf_offset, y_sz };
@@ -4037,6 +4071,28 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4037
 
4038
  ggml_vk_sync_buffers(subctx);
4039
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4040
  } else if (use_src1) {
4041
  ggml_vk_sync_buffers(subctx);
4042
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4047,6 +4103,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4047
  } else {
4048
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4049
  GGML_ASSERT(op != GGML_OP_ARGSORT);
 
4050
 
4051
  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
4052
 
@@ -4088,7 +4145,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4088
  }
4089
 
4090
  static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4091
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
4092
  }
4093
 
4094
  static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4096,7 +4153,7 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx,
4096
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4097
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4098
 
4099
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_GET_ROWS, {
4100
  (uint32_t)ggml_nelements(src0),
4101
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4102
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
@@ -4111,7 +4168,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4111
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4112
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4113
 
4114
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ADD, {
4115
  (uint32_t)ggml_nelements(src0),
4116
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4117
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
@@ -4126,7 +4183,7 @@ static void ggml_vk_mul(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4126
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4127
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4128
 
4129
- ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_MUL, {
4130
  (uint32_t)ggml_nelements(src0),
4131
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4132
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
@@ -4141,7 +4198,7 @@ static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context * subctx, co
4141
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4142
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4143
 
4144
- ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SCALE, {
4145
  (uint32_t)ggml_nelements(src0),
4146
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4147
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
@@ -4154,7 +4211,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4154
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4155
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4156
 
4157
- ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_SQR, {
4158
  (uint32_t)ggml_nelements(src0),
4159
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4160
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
@@ -4168,7 +4225,7 @@ static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context * subctx, co
4168
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4169
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4170
 
4171
- ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CLAMP, {
4172
  (uint32_t)ggml_nelements(src0),
4173
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4174
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
@@ -4183,7 +4240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4183
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4184
  const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4185
 
4186
- ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
4187
  (uint32_t)ggml_nelements(src0),
4188
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4189
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
@@ -4195,21 +4252,21 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
4195
  static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4196
  float * op_params = (float *)dst->op_params;
4197
 
4198
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4199
  }
4200
 
4201
  static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4202
  float * op_params = (float *)dst->op_params;
4203
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4204
  }
4205
 
4206
  static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4207
- ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
4208
  }
4209
 
4210
  static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4211
  int32_t * op_params = (int32_t *)dst->op_params;
4212
- ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
4213
  }
4214
 
4215
  static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -4228,7 +4285,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
4228
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
4229
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
4230
 
4231
- ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_SOFT_MAX, {
4232
  ncols,
4233
  src1 != nullptr ? nrows_y : (uint32_t)0,
4234
  scale, max_bias,
@@ -4237,11 +4294,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
4237
  });
4238
  }
4239
 
4240
- static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4241
- #pragma message("TODO: implement phi3 frequency factors support")
4242
- #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
4243
- GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
4244
-
4245
  const int n_dims = ((int32_t *) dst->op_params)[1];
4246
  const int mode = ((int32_t *) dst->op_params)[2];
4247
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
@@ -4264,12 +4317,13 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
4264
  if (is_neox) {
4265
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
4266
  const float inv_ndims = -1.0f / n_dims;
4267
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
4268
  (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4269
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims
 
4270
  });
4271
  } else {
4272
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, dst, GGML_OP_ROPE, {
4273
  (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
4274
  freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
4275
  });
@@ -4292,7 +4346,7 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx,
4292
 
4293
  std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
4294
 
4295
- ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_ARGSORT, {
4296
  ncols,
4297
  ncols_pad,
4298
  op_params[0],
@@ -5408,6 +5462,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5408
 
5409
  const ggml_tensor * src0 = node->src[0];
5410
  const ggml_tensor * src1 = node->src[1];
 
5411
 
5412
  switch (node->op) {
5413
  case GGML_OP_UNARY:
@@ -5524,7 +5579,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
5524
 
5525
  break;
5526
  case GGML_OP_ROPE:
5527
- ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, node);
5528
 
5529
  break;
5530
  case GGML_OP_ARGSORT:
@@ -6500,7 +6555,7 @@ static void ggml_vk_print_graph_origin(const ggml_tensor * tensor, std::vector<c
6500
  for (int j = 0; j < level; j++) {
6501
  std::cerr << " ";
6502
  }
6503
- std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << " backend=" << tensor->backend << std::endl;
6504
 
6505
  done.push_back(tensor);
6506
 
@@ -6550,7 +6605,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
6550
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
6551
  void * tensor_data = tensor->data;
6552
 
6553
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6554
  const size_t tensor_size = ggml_nbytes(tensor);
6555
  tensor_data = malloc(tensor_size);
6556
 
@@ -6561,12 +6616,12 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6561
  }
6562
 
6563
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
6564
- std::cerr << "tensor=" << tensor << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6565
  if (tensor->src[0] != nullptr) {
6566
- std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " backend=" << tensor->src[0]->backend << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6567
  }
6568
  if (tensor->src[1] != nullptr) {
6569
- std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " backend=" << tensor->src[1]->backend << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6570
  }
6571
  std::cerr << std::endl << "Result:" << std::endl;
6572
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@@ -6577,43 +6632,11 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso
6577
  std::vector<const ggml_tensor *> done;
6578
  ggml_vk_print_graph_origin(tensor, done);
6579
 
6580
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6581
  free(tensor_data);
6582
  }
6583
  }
6584
 
6585
- static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) {
6586
- return;
6587
- GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU);
6588
- if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) {
6589
- return;
6590
- }
6591
- for (int i3 = 0; i3 < tensor->ne[3]; i3++) {
6592
- for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
6593
- for (int i1 = 0; i1 < tensor->ne[1]; i1++) {
6594
- for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
6595
- float val = 0.0f;
6596
- if (tensor->type == GGML_TYPE_F32) {
6597
- val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]);
6598
- } else if (tensor->type == GGML_TYPE_F16) {
6599
- val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + i1*tensor->nb[1] + i0*tensor->nb[0]));
6600
- }
6601
- if (std::isnan(val)) {
6602
- std::cerr << "ERROR: TENSOR CHECK " << name << ": Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " val=" << val << std::endl;
6603
- std::cerr << "tensor=" << tensor << " tensor->type=" << ggml_type_name(tensor->type) << " tensor->backend: " << tensor->backend << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6604
- std::cerr << std::endl;
6605
- ggml_vk_print_tensor_area(tensor, tensor->data, i0, i1, i2, i3);
6606
- std::cerr << std::endl;
6607
- std::vector<const ggml_tensor *> done;
6608
- ggml_vk_print_graph_origin(tensor, done);
6609
- GGML_ASSERT(false);
6610
- }
6611
- }
6612
- }
6613
- }
6614
- }
6615
- }
6616
-
6617
  void * comp_result;
6618
  size_t comp_size;
6619
  size_t comp_nb[GGML_MAX_DIMS];
@@ -6637,6 +6660,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6637
 
6638
  ggml_tensor * src0 = tensor->src[0];
6639
  ggml_tensor * src1 = tensor->src[1];
 
6640
 
6641
  struct ggml_init_params iparams = {
6642
  /*.mem_size =*/ 1024*1024*1024,
@@ -6666,10 +6690,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6666
 
6667
  src0_buffer = malloc(src0_size);
6668
  src0_clone->data = src0_buffer;
6669
- if (src0->backend == GGML_BACKEND_TYPE_CPU) {
6670
  memcpy(src0_clone->data, src0->data, src0_size);
6671
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6672
- } else if (src0->backend == GGML_BACKEND_TYPE_GPU) {
6673
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6674
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6675
  uint64_t offset = extra->offset;
@@ -6700,8 +6724,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6700
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6701
  ggml_vk_print_tensor(ctx, src0, "src0");
6702
  }
6703
-
6704
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src0", src0_clone);
6705
  }
6706
  if (src1 != nullptr) {
6707
  src1_clone = ggml_dup_tensor(ggml_ctx, src1);
@@ -6710,10 +6732,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6710
 
6711
  src1_buffer = malloc(src1_size);
6712
  src1_clone->data = src1_buffer;
6713
- if (src1->backend == GGML_BACKEND_TYPE_CPU) {
6714
  memcpy(src1_clone->data, src1->data, src1_size);
6715
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6716
- } else if (src1->backend == GGML_BACKEND_TYPE_GPU) {
6717
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6718
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6719
  uint64_t offset = extra->offset;
@@ -6744,12 +6766,12 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6744
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6745
  ggml_vk_print_tensor(ctx, src1, "src1");
6746
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
6747
- std::cerr << "src1_clone=" << tensor << " src1_clone->backend: " << src1_clone->backend << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6748
  if (src1->src[0] != nullptr) {
6749
- std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " backend=" << src1->src[0]->backend << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6750
  }
6751
  if (src1->src[1] != nullptr) {
6752
- std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " backend=" << src1->src[1]->backend << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6753
  }
6754
  std::cerr << std::endl << "Result:" << std::endl;
6755
  ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
@@ -6760,8 +6782,64 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6760
  std::vector<const ggml_tensor *> done;
6761
  ggml_vk_print_graph_origin(src1_clone, done);
6762
  }
6763
 
6764
- ggml_vk_check_tensor(std::string(ggml_op_name(tensor->op)) + "->src1", src1_clone);
6765
  }
6766
 
6767
  if (tensor->op == GGML_OP_MUL_MAT) {
@@ -6799,7 +6877,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6799
  float attn_factor = ((float *) tensor->op_params)[8];
6800
  float beta_fast = ((float *) tensor->op_params)[9];
6801
  float beta_slow = ((float *) tensor->op_params)[10];
6802
- tensor_clone = ggml_rope_custom(ggml_ctx, src0_clone, src1_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
6803
  } else if (tensor->op == GGML_OP_UNARY) {
6804
  switch (ggml_get_unary_op(tensor)) {
6805
  case GGML_UNARY_OP_SILU:
@@ -6847,7 +6925,6 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_
6847
 
6848
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6849
 
6850
- ggml_vk_check_tensor(ggml_op_name(tensor->op), tensor_clone);
6851
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6852
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
6853
  }
@@ -6888,7 +6965,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6888
 
6889
  void * tensor_data = tensor->data;
6890
 
6891
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
6892
  size_t tensor_size = ggml_nbytes(tensor);
6893
  tensor_data = malloc(tensor_size);
6894
 
@@ -6936,12 +7013,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6936
 
6937
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
6938
  std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
6939
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
6940
  if (src0 != nullptr) {
6941
- std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
6942
  }
6943
  if (src1 != nullptr) {
6944
- std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
6945
  }
6946
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
6947
  std::cerr << std::endl << "Result:" << std::endl;
@@ -6977,12 +7054,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
6977
 
6978
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6979
  std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
6980
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
6981
  if (src0 != nullptr) {
6982
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
6983
  }
6984
  if (src1 != nullptr) {
6985
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
6986
  }
6987
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
6988
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7001,12 +7078,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7001
 
7002
  if (avg_err > 0.05 || std::isnan(avg_err)) {
7003
  std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7004
- std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->backend: " << tensor->backend << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7005
  if (src0 != nullptr) {
7006
- std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " backend=" << src0->backend << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7007
  }
7008
  if (src1 != nullptr) {
7009
- std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " backend=" << src1->backend << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7010
  }
7011
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7012
  std::cerr << std::endl << "Result:" << std::endl;
@@ -7018,14 +7095,14 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_
7018
  ggml_vk_print_graph_origin(tensor, done);
7019
  GGML_ASSERT(false);
7020
  } else {
7021
- std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " backend=" << tensor->backend << " avg_err=" << avg_err << std::endl;
7022
  }
7023
 
7024
  free(comp_result);
7025
  comp_result = nullptr;
7026
  comp_size = 0;
7027
 
7028
- if (tensor->backend == GGML_BACKEND_TYPE_GPU) {
7029
  free(tensor_data);
7030
  }
7031
  }
 
290
  float corr_dims[4];
291
  float theta_scale;
292
  float inv_ndims;
293
+ uint32_t has_freq_facs;
294
  };
295
 
296
  struct vk_op_soft_max_push_constants {
 
1523
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1524
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1525
 
1526
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1527
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1528
 
1529
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1530
  }
 
3733
  }
3734
 
3735
 
3736
+ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) {
3737
  switch (op) {
3738
  case GGML_OP_ADD:
3739
  if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
 
3854
  default:
3855
  return nullptr;
3856
  }
3857
+
3858
+ GGML_UNUSED(src2);
3859
  }
3860
 
3861
  static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
 
3883
  }
3884
 
3885
  template<typename PC>
3886
+ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) {
3887
  #ifdef GGML_VULKAN_DEBUG
3888
  std::cerr << "ggml_vk_op_f32((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3];
3889
  if (src1 != nullptr) {
3890
  std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3];
3891
  }
3892
+ if (src2 != nullptr) {
3893
+ std::cerr << "), (" << src2 << ", name=" << src2->name << ", type=" << src2->type << ", ne0=" << src2->ne[0] << ", ne1=" << src2->ne[1] << ", ne2=" << src2->ne[2] << ", ne3=" << src2->ne[3] << ", nb0=" << src2->nb[0] << ", nb1=" << src2->nb[1] << ", nb2=" << src2->nb[2] << ", nb3=" << src2->nb[3];
3894
+ }
3895
  std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl;
3896
  #endif
3897
  GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT
 
3902
  const uint64_t ne02 = src0->ne[2];
3903
  const uint64_t ne03 = src0->ne[3];
3904
  const uint64_t ne0 = ne00 * ne01;
3905
+
3906
  const bool use_src1 = src1 != nullptr;
3907
  const uint64_t ne10 = use_src1 ? src1->ne[0] : 0;
3908
  const uint64_t ne11 = use_src1 ? src1->ne[1] : 0;
 
3911
  const uint64_t ne1 = ne10 * ne11;
3912
  // const uint64_t nb10 = use_src1 ? src1->nb[0] : 0;
3913
 
3914
+ const bool use_src2 = src2 != nullptr;
3915
+ const uint64_t ne20 = use_src2 ? src2->ne[0] : 0;
3916
+ const uint64_t ne21 = use_src2 ? src2->ne[1] : 0;
3917
+ const uint64_t ne22 = use_src2 ? src2->ne[2] : 0;
3918
+ const uint64_t ne23 = use_src2 ? src2->ne[3] : 0;
3919
+ const uint64_t ne2 = ne20 * ne21;
3920
+
3921
+ vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, dst, op);
3922
  ggml_vk_func_t op_func;
3923
 
3924
  if (pipeline == nullptr) {
 
3941
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
3942
  ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra;
3943
  ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
3944
+ ggml_tensor_extra_gpu * extra_src2 = use_src2 ? (ggml_tensor_extra_gpu *) src2->extra : nullptr;
3945
 
3946
  vk_buffer d_X = nullptr;
3947
  size_t x_buf_offset = 0;
3948
  vk_buffer d_Y = nullptr;
3949
  size_t y_buf_offset = 0;
3950
  vk_buffer d_Z = nullptr;
3951
+ size_t z_buf_offset = 0;
3952
 
3953
  bool src0_uma = false;
3954
  bool src1_uma = false;
3955
+ bool src2_uma = false;
3956
 
3957
  if (ctx->device->uma) {
3958
  ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
 
3961
  ggml_vk_host_get(ctx, src1->data, d_Y, y_buf_offset);
3962
  src1_uma = d_Y != nullptr;
3963
  }
3964
+ if (use_src2) {
3965
+ ggml_vk_host_get(ctx, src2->data, d_Z, z_buf_offset);
3966
+ src2_uma = d_Z != nullptr;
3967
+ }
3968
  }
3969
 
3970
  uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
3971
  uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
3972
+ uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
3973
  uint64_t d_sz = ggml_type_size(dst->type) * ne0;
3974
 
3975
  vk_buffer d_D = extra->buffer_gpu.lock();
 
3992
  y_buf_offset = extra_src1->offset;
3993
  GGML_ASSERT(d_Y != nullptr);
3994
  }
3995
+ if (use_src2 && !src2_uma) {
3996
+ d_Z = extra_src2->buffer_gpu.lock();
3997
+ z_buf_offset = extra_src2->offset;
3998
+ GGML_ASSERT(d_Z != nullptr);
3999
+ }
4000
 
4001
  if (op_supports_incontiguous) {
4002
  x_sz = ggml_nbytes(src0);
4003
  y_sz = use_src1 ? ggml_nbytes(src1) : 0;
4004
+ z_sz = use_src2 ? ggml_nbytes(src2) : 0;
4005
  d_sz = ggml_nbytes(dst);
4006
 
4007
  if (x_buf_offset + x_sz >= d_X->size) {
 
4010
  if (use_src1 && y_buf_offset + y_sz >= d_Y->size) {
4011
  y_sz = VK_WHOLE_SIZE;
4012
  }
4013
+ if (use_src2 && z_buf_offset + z_sz >= d_Z->size) {
4014
+ z_sz = VK_WHOLE_SIZE;
4015
+ }
4016
  if (d_buf_offset + d_sz >= d_D->size) {
4017
  d_sz = VK_WHOLE_SIZE;
4018
  }
 
4052
  if (use_src1 && y_sz != VK_WHOLE_SIZE) {
4053
  y_sz *= ne12 * ne13;
4054
  }
4055
+ if (use_src2 && z_sz != VK_WHOLE_SIZE) {
4056
+ z_sz *= ne22 * ne23;
4057
+ }
4058
  if (d_sz != VK_WHOLE_SIZE) {
4059
  d_sz *= ne02 * ne03;
4060
  }
4061
  }
4062
 
4063
  if (op == GGML_OP_SOFT_MAX) {
4064
+ // Empty src1 is possible in soft_max, but the shader needs a buffer
4065
  vk_subbuffer subbuf_y;
4066
  if (use_src1) {
4067
  subbuf_y = { d_Y, y_buf_offset, y_sz };
 
4071
 
4072
  ggml_vk_sync_buffers(subctx);
4073
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4074
+ } else if (op == GGML_OP_ROPE) {
4075
+ const int mode = ((int32_t *) dst->op_params)[2];
4076
+ const bool is_neox = mode & 2;
4077
+
4078
+ if (is_neox) {
4079
+ // Empty src2 is possible in rope, but the shader needs a buffer
4080
+ vk_subbuffer subbuf_z;
4081
+ if (use_src2) {
4082
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
4083
+ } else {
4084
+ subbuf_z = { d_X, 0, d_X->size };
4085
+ }
4086
+
4087
+ ggml_vk_sync_buffers(subctx);
4088
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4089
+ } else {
4090
+ ggml_vk_sync_buffers(subctx);
4091
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4092
+ }
4093
+ } else if (use_src2) {
4094
+ ggml_vk_sync_buffers(subctx);
4095
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4096
  } else if (use_src1) {
4097
  ggml_vk_sync_buffers(subctx);
4098
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
 
4103
  } else {
4104
  GGML_ASSERT(op != GGML_OP_SOFT_MAX);
4105
  GGML_ASSERT(op != GGML_OP_ARGSORT);
4106
+ GGML_ASSERT(!use_src2);
4107
 
4108
  ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
4109
 
 
4145
  }
4146
 
4147
  static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
4148
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_REPEAT, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f });
4149
  }
4150
 
4151
  static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4153
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4154
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4155
 
4156
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, {
4157
  (uint32_t)ggml_nelements(src0),
4158
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4159
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
 
4168
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4169
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4170
 
4171
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ADD, {
4172
  (uint32_t)ggml_nelements(src0),
4173
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4174
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
 
4183
  const uint32_t src1_type_size = ggml_type_size(src1->type);
4184
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4185
 
4186
+ ggml_vk_op_f32<vk_op_binary_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_MUL, {
4187
  (uint32_t)ggml_nelements(src0),
4188
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4189
  (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
 
4198
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4199
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4200
 
4201
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
4202
  (uint32_t)ggml_nelements(src0),
4203
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4204
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
 
4211
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4212
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4213
 
4214
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
4215
  (uint32_t)ggml_nelements(src0),
4216
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4217
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
 
4225
  const uint32_t src0_type_size = ggml_type_size(src0->type);
4226
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4227
 
4228
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
4229
  (uint32_t)ggml_nelements(src0),
4230
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4231
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
 
4240
  const uint32_t dst_type_size = ggml_type_size(dst->type);
4241
  const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
4242
 
4243
+ ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
4244
  (uint32_t)ggml_nelements(src0),
4245
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
4246
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
 
4252
  static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4253
  float * op_params = (float *)dst->op_params;
4254
 
4255
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4256
  }
4257
 
4258
  static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4259
  float * op_params = (float *)dst->op_params;
4260
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f });
4261
  }
4262
 
4263
  static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4264
+ ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f });
4265
  }
4266
 
4267
  static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
4268
  int32_t * op_params = (int32_t *)dst->op_params;
4269
+ ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] });
4270
  }
4271
 
4272
  static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
 
4285
  const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
4286
  const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
4287
 
4288
+ ggml_vk_op_f32<vk_op_soft_max_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SOFT_MAX, {
4289
  ncols,
4290
  src1 != nullptr ? nrows_y : (uint32_t)0,
4291
  scale, max_bias,
 
4294
  });
4295
  }
4296
 
4297
+ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4298
  const int n_dims = ((int32_t *) dst->op_params)[1];
4299
  const int mode = ((int32_t *) dst->op_params)[2];
4300
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
 
4317
  if (is_neox) {
4318
  const float theta_scale = powf(freq_base, -2.0f/n_dims);
4319
  const float inv_ndims = -1.0f / n_dims;
4320
+ ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4321
  (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4322
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
4323
+ src2 != nullptr,
4324
  });
4325
  } else {
4326
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4327
  (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
4328
  freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
4329
  });
 
4346
 
4347
  std::cerr << ((ggml_sort_order) op_params[0]) << " " << GGML_SORT_ORDER_ASC << std::endl;
4348
 
4349
+ ggml_vk_op_f32<vk_op_argsort_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, {
4350
  ncols,
4351
  ncols_pad,
4352
  op_params[0],
 
5462
 
5463
  const ggml_tensor * src0 = node->src[0];
5464
  const ggml_tensor * src1 = node->src[1];
5465
+ const ggml_tensor * src2 = node->src[2];
5466
 
5467
  switch (node->op) {
5468
  case GGML_OP_UNARY:
 
5579
 
5580
  break;
5581
  case GGML_OP_ROPE:
5582
+ ggml_vk_rope(ctx, ctx->compute_ctx, src0, src1, src2, node);
5583
 
5584
  break;
5585
  case GGML_OP_ARGSORT:
 
6555
  for (int j = 0; j < level; j++) {
6556
  std::cerr << " ";
6557
  }
6558
+ std::cerr << ggml_op_name(tensor->op) << " gpu=" << (tensor->extra != nullptr) << std::endl;
6559
 
6560
  done.push_back(tensor);
6561
 
 
6605
  static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) {
6606
  void * tensor_data = tensor->data;
6607
 
6608
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6609
  const size_t tensor_size = ggml_nbytes(tensor);
6610
  tensor_data = malloc(tensor_size);
6611
 
 
6616
  }
6617
 
6618
  std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl;
6619
+ std::cerr << "tensor=" << tensor << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << std::endl;
6620
  if (tensor->src[0] != nullptr) {
6621
+ std::cerr << "tensor->src[0]=" << tensor->src[0] << " name=" << tensor->src[0]->name << " op=" << ggml_op_name(tensor->src[0]->op) << " type=" << ggml_type_name(tensor->src[0]->type) << " ne0=" << tensor->src[0]->ne[0] << " nb0=" << tensor->src[0]->nb[0] << " ne1=" << tensor->src[0]->ne[1] << " nb1=" << tensor->src[0]->nb[1] << " ne2=" << tensor->src[0]->ne[2] << " nb2=" << tensor->src[0]->nb[2] << " ne3=" << tensor->src[0]->ne[3] << " nb3=" << tensor->src[0]->nb[3] << std::endl;
6622
  }
6623
  if (tensor->src[1] != nullptr) {
6624
+ std::cerr << "tensor->src[1]=" << tensor->src[1] << " name=" << tensor->src[1]->name << " op=" << ggml_op_name(tensor->src[1]->op) << " type=" << ggml_type_name(tensor->src[1]->type) << " ne0=" << tensor->src[1]->ne[0] << " nb0=" << tensor->src[1]->nb[0] << " ne1=" << tensor->src[1]->ne[1] << " nb1=" << tensor->src[1]->nb[1] << " ne2=" << tensor->src[1]->ne[2] << " nb2=" << tensor->src[1]->nb[2] << " ne3=" << tensor->src[1]->ne[3] << " nb3=" << tensor->src[1]->nb[3] << std::endl;
6625
  }
6626
  std::cerr << std::endl << "Result:" << std::endl;
6627
  ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
 
6632
  std::vector<const ggml_tensor *> done;
6633
  ggml_vk_print_graph_origin(tensor, done);
6634
 
6635
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6636
  free(tensor_data);
6637
  }
6638
  }
6639
 
6640
  void * comp_result;
6641
  size_t comp_size;
6642
  size_t comp_nb[GGML_MAX_DIMS];
 
6660
 
6661
  ggml_tensor * src0 = tensor->src[0];
6662
  ggml_tensor * src1 = tensor->src[1];
6663
+ ggml_tensor * src2 = tensor->src[2];
6664
 
6665
  struct ggml_init_params iparams = {
6666
  /*.mem_size =*/ 1024*1024*1024,
 
6690
 
6691
  src0_buffer = malloc(src0_size);
6692
  src0_clone->data = src0_buffer;
6693
+ if (ggml_backend_buffer_is_host(src0->buffer)) {
6694
  memcpy(src0_clone->data, src0->data, src0_size);
6695
  memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
6696
+ } else if (ggml_backend_buffer_is_vk(src0->buffer)) {
6697
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra;
6698
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6699
  uint64_t offset = extra->offset;
 
6724
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6725
  ggml_vk_print_tensor(ctx, src0, "src0");
6726
  }
6727
  }
6728
  if (src1 != nullptr) {
6729
  src1_clone = ggml_dup_tensor(ggml_ctx, src1);
 
6732
 
6733
  src1_buffer = malloc(src1_size);
6734
  src1_clone->data = src1_buffer;
6735
+ if (ggml_backend_buffer_is_host(src1->buffer)) {
6736
  memcpy(src1_clone->data, src1->data, src1_size);
6737
  memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
6738
+ } else if (ggml_backend_buffer_is_vk(src1->buffer)) {
6739
  ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra;
6740
  vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6741
  uint64_t offset = extra->offset;
 
6766
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6767
  ggml_vk_print_tensor(ctx, src1, "src1");
6768
  std::cerr << "TENSOR CHECK: " << ggml_op_name(src1_clone->op) << " (check " << check_counter << ")" << std::endl;
6769
+ std::cerr << "src1_clone=" << tensor << " src1_clone->type: " << ggml_type_name(src1_clone->type) << " ne0=" << src1_clone->ne[0] << " nb0=" << src1_clone->nb[0] << " ne1=" << src1_clone->ne[1] << " nb1=" << src1_clone->nb[1] << " ne2=" << src1_clone->ne[2] << " nb2=" << src1_clone->nb[2] << " ne3=" << src1_clone->ne[3] << " nb3=" << src1_clone->nb[3] << std::endl;
6770
  if (src1->src[0] != nullptr) {
6771
+ std::cerr << "src1->src[0]=" << src1->src[0] << " op=" << ggml_op_name(src1->src[0]->op) << " type=" << ggml_type_name(src1->src[0]->type) << " ne0=" << src1->src[0]->ne[0] << " nb0=" << src1->src[0]->nb[0] << " ne1=" << src1->src[0]->ne[1] << " nb1=" << src1->src[0]->nb[1] << " ne2=" << src1->src[0]->ne[2] << " nb2=" << src1->src[0]->nb[2] << " ne3=" << src1->src[0]->ne[3] << " nb3=" << src1->src[0]->nb[3] << std::endl;
6772
  }
6773
  if (src1->src[1] != nullptr) {
6774
+ std::cerr << "src1->src[1]=" << src1->src[1] << " op=" << ggml_op_name(src1->src[1]->op) << " type=" << ggml_type_name(src1->src[1]->type) << " ne0=" << src1->src[1]->ne[0] << " nb0=" << src1->src[1]->nb[0] << " ne1=" << src1->src[1]->ne[1] << " nb1=" << src1->src[1]->nb[1] << " ne2=" << src1->src[1]->ne[2] << " nb2=" << src1->src[1]->nb[2] << " ne3=" << src1->src[1]->ne[3] << " nb3=" << src1->src[1]->nb[3] << std::endl;
6775
  }
6776
  std::cerr << std::endl << "Result:" << std::endl;
6777
  ggml_vk_print_tensor_area(src1_clone, src1_clone->data, 5, 5, 0, 0);
 
6782
  std::vector<const ggml_tensor *> done;
6783
  ggml_vk_print_graph_origin(src1_clone, done);
6784
  }
6785
+ }
6786
+ if (src2 != nullptr) {
6787
+ src2_clone = ggml_dup_tensor(ggml_ctx, src2);
6788
+
6789
+ src2_size = ggml_nbytes(src2);
6790
+
6791
+ src2_buffer = malloc(src2_size);
6792
+ src2_clone->data = src2_buffer;
6793
+ if (ggml_backend_buffer_is_host(src2->buffer)) {
6794
+ memcpy(src2_clone->data, src2->data, src2_size);
6795
+ memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6796
+ } else if (ggml_backend_buffer_is_vk(src2->buffer)) {
6797
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra;
6798
+ vk_buffer buffer_gpu = extra->buffer_gpu.lock();
6799
+ uint64_t offset = extra->offset;
6800
+ if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) {
6801
+ for (int i3 = 0; i3 < src2->ne[3]; i3++) {
6802
+ for (int i2 = 0; i2 < src2->ne[2]; i2++) {
6803
+ const int idx = i3*src2->ne[2] + i2;
6804
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src2->nb[2], ((char *)src2_clone->data + idx * src2_clone->nb[2]), src2->ne[1] * src2->nb[1]);
6805
+ }
6806
+ }
6807
+
6808
+ src2_clone->nb[0] = src2->nb[0];
6809
+ src2_clone->nb[1] = src2->nb[1];
6810
+ for (int i = 2; i < GGML_MAX_DIMS; i++) {
6811
+ src2_clone->nb[i] = src2_clone->nb[i - 1]*src2_clone->ne[i - 1];
6812
+ }
6813
+ } else {
6814
+ if (offset + src2_size >= buffer_gpu->size) {
6815
+ src2_size = buffer_gpu->size - offset;
6816
+ }
6817
+ ggml_vk_buffer_read(ctx, buffer_gpu, offset, src2_clone->data, src2_size);
6818
+ memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
6819
+ }
6820
+ } else {
6821
+ GGML_ASSERT(false);
6822
+ }
6823
 
6824
+ if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6825
+ ggml_vk_print_tensor(ctx, src2, "src2");
6826
+ std::cerr << "TENSOR CHECK: " << ggml_op_name(src2_clone->op) << " (check " << check_counter << ")" << std::endl;
6827
+ std::cerr << "src2_clone=" << tensor << " src2_clone->type: " << ggml_type_name(src2_clone->type) << " ne0=" << src2_clone->ne[0] << " nb0=" << src2_clone->nb[0] << " ne1=" << src2_clone->ne[1] << " nb1=" << src2_clone->nb[1] << " ne2=" << src2_clone->ne[2] << " nb2=" << src2_clone->nb[2] << " ne3=" << src2_clone->ne[3] << " nb3=" << src2_clone->nb[3] << std::endl;
6828
+ if (src2->src[0] != nullptr) {
6829
+ std::cerr << "src2->src[0]=" << src2->src[0] << " op=" << ggml_op_name(src2->src[0]->op) << " type=" << ggml_type_name(src2->src[0]->type) << " ne0=" << src2->src[0]->ne[0] << " nb0=" << src2->src[0]->nb[0] << " ne1=" << src2->src[0]->ne[1] << " nb1=" << src2->src[0]->nb[1] << " ne2=" << src2->src[0]->ne[2] << " nb2=" << src2->src[0]->nb[2] << " ne3=" << src2->src[0]->ne[3] << " nb3=" << src2->src[0]->nb[3] << std::endl;
6830
+ }
6831
+ if (src2->src[1] != nullptr) {
6832
+ std::cerr << "src2->src[1]=" << src2->src[1] << " op=" << ggml_op_name(src2->src[1]->op) << " type=" << ggml_type_name(src2->src[1]->type) << " ne0=" << src2->src[1]->ne[0] << " nb0=" << src2->src[1]->nb[0] << " ne1=" << src2->src[1]->ne[1] << " nb1=" << src2->src[1]->nb[1] << " ne2=" << src2->src[1]->ne[2] << " nb2=" << src2->src[1]->nb[2] << " ne3=" << src2->src[1]->ne[3] << " nb3=" << src2->src[1]->nb[3] << std::endl;
6833
+ }
6834
+ std::cerr << std::endl << "Result:" << std::endl;
6835
+ ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 0, 0);
6836
+ std::cerr << std::endl;
6837
+ std::cerr << std::endl << "Result:" << std::endl;
6838
+ ggml_vk_print_tensor_area(src2_clone, src2_clone->data, 5, 5, 1, 0);
6839
+ std::cerr << std::endl;
6840
+ std::vector<const ggml_tensor *> done;
6841
+ ggml_vk_print_graph_origin(src2_clone, done);
6842
+ }
6843
  }
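The partially-contiguous branch above reads the src2 clone back slice by slice; a small worked example of the offset arithmetic it relies on, with made-up sizes:

// Example with ne = {64, 32, 8, 2}: for the slice (i2 = 3, i3 = 1),
//   idx = i3*ne[2] + i2 = 1*8 + 3 = 11,
// so ne[1]*nb[1] bytes (one dim0/dim1 slab) are copied from GPU offset `offset + 11*src2->nb[2]`
// into the host clone at `11*src2_clone->nb[2]`; after the loop the clone's higher strides are
// rebuilt from src2's nb[0..1] so the host copy is densely packed.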
6844
 
6845
  if (tensor->op == GGML_OP_MUL_MAT) {
 
6877
  float attn_factor = ((float *) tensor->op_params)[8];
6878
  float beta_fast = ((float *) tensor->op_params)[9];
6879
  float beta_slow = ((float *) tensor->op_params)[10];
6880
+ tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
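As a usage sketch mirroring the argument order of the call above (names such as ctx0, q, positions and freq_factors are placeholders, not identifiers from this file), a caller passes the optional frequency-factor tensor as the third source and may pass NULL when no factors are used:

// Hypothetical call site; freq_factors may be NULL.
struct ggml_tensor * rotated = ggml_rope_ext(
        ctx0, q, positions, freq_factors,
        n_dims, mode, n_ctx, n_orig_ctx,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);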
6881
  } else if (tensor->op == GGML_OP_UNARY) {
6882
  switch (ggml_get_unary_op(tensor)) {
6883
  case GGML_UNARY_OP_SILU:
 
6925
 
6926
  ggml_graph_compute_with_ctx(ggml_ctx, cgraph, 8);
6927
 
 
6928
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
6929
  ggml_vk_print_tensor(ctx, tensor_clone, "tensor_clone");
6930
  }
 
6965
 
6966
  void * tensor_data = tensor->data;
6967
 
6968
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
6969
  size_t tensor_size = ggml_nbytes(tensor);
6970
  tensor_data = malloc(tensor_size);
6971
 
 
7013
 
7014
  if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
7015
  std::cerr << "ERROR: Invalid value in " << ggml_op_name(tensor->op) << " i3=" << i3 << " i2=" << i2 << " i1=" << i1 << " i0=" << i0 << " result=" << result << " correct=" << correct << " avg_err=" << (avg_err / counter) << std::endl;
7016
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7017
  if (src0 != nullptr) {
7018
+ std::cerr << "src0=" << src0 << " src0->name=" << src0->name << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7019
  }
7020
  if (src1 != nullptr) {
7021
+ std::cerr << "src1=" << src1 << " src1->name=" << src1->name << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7022
  }
7023
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7024
  std::cerr << std::endl << "Result:" << std::endl;
 
7054
 
7055
  if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
7056
  std::cerr << "TENSOR CHECK: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7057
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7058
  if (src0 != nullptr) {
7059
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7060
  }
7061
  if (src1 != nullptr) {
7062
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7063
  }
7064
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7065
  std::cerr << std::endl << "Result:" << std::endl;
 
7078
 
7079
  if (avg_err > 0.05 || std::isnan(avg_err)) {
7080
  std::cerr << "ERROR: avg_err=" << avg_err << " in " << ggml_op_name(tensor->op) << " (check " << check_counter << ")" << std::endl;
7081
+ std::cerr << "tensor=" << tensor << " tensor->name=" << tensor->name << " tensor->type: " << ggml_type_name(tensor->type) << " ne0=" << tensor->ne[0] << " nb0=" << tensor->nb[0] << " ne1=" << tensor->ne[1] << " nb1=" << tensor->nb[1] << " ne2=" << tensor->ne[2] << " nb2=" << tensor->nb[2] << " ne3=" << tensor->ne[3] << " nb3=" << tensor->nb[3] << " offset=" << tensor->view_offs << std::endl;
7082
  if (src0 != nullptr) {
7083
+ std::cerr << "src0=" << src0 << " op=" << ggml_op_name(src0->op) << " type=" << ggml_type_name(src0->type) << " ne0=" << src0->ne[0] << " nb0=" << src0->nb[0] << " ne1=" << src0->ne[1] << " nb1=" << src0->nb[1] << " ne2=" << src0->ne[2] << " nb2=" << src0->nb[2] << " ne3=" << src0->ne[3] << " nb3=" << src0->nb[3] << " offset=" << src0->view_offs << std::endl;
7084
  }
7085
  if (src1 != nullptr) {
7086
+ std::cerr << "src1=" << src1 << " op=" << ggml_op_name(src1->op) << " type=" << ggml_type_name(src1->type) << " ne0=" << src1->ne[0] << " nb0=" << src1->nb[0] << " ne1=" << src1->ne[1] << " nb1=" << src1->nb[1] << " ne2=" << src1->ne[2] << " nb2=" << src1->nb[2] << " ne3=" << src1->ne[3] << " nb3=" << src1->nb[3] << " offset=" << src1->view_offs << std::endl;
7087
  }
7088
  std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
7089
  std::cerr << std::endl << "Result:" << std::endl;
 
7095
  ggml_vk_print_graph_origin(tensor, done);
7096
  GGML_ASSERT(false);
7097
  } else {
7098
+ std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
7099
  }
7100
 
7101
  free(comp_result);
7102
  comp_result = nullptr;
7103
  comp_size = 0;
7104
 
7105
+ if (ggml_backend_buffer_is_vk(tensor->buffer)) {
7106
  free(tensor_data);
7107
  }
7108
  }