OccamRazor slaren committed on
Commit
71850e7
·
1 Parent(s): 154bf2b

Update Vulkan RoPE implementation (llama/7818)

Browse files

* Update Vulkan RoPE implementation

* Return nullptr on alloc_buffer when allocation fails, instead of throwing an exception

Minor fixes

* Fix segfault when running out of VRAM

Co-authored-by: slaren <[email protected]>

---------

Co-authored-by: slaren <[email protected]>

Files changed (2) hide show
  1. ggml-alloc.c +1 -1
  2. ggml-vulkan.cpp +34 -59
ggml-alloc.c CHANGED
@@ -886,7 +886,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
886
  fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
887
  #endif
888
  for (size_t i = 0; i < *n_buffers; i++) {
889
- ggml_backend_buffer_free(*buffers[i]);
890
  }
891
  free(*buffers);
892
  return false;
 
886
  fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
887
  #endif
888
  for (size_t i = 0; i < *n_buffers; i++) {
889
+ ggml_backend_buffer_free((*buffers)[i]);
890
  }
891
  free(*buffers);
892
  return false;
ggml-vulkan.cpp CHANGED
@@ -150,7 +150,7 @@ struct vk_device {
150
  vk_pipeline pipeline_relu_f32;
151
  vk_pipeline pipeline_diag_mask_inf_f32;
152
  vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
153
- vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
154
  vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
155
  vk_pipeline pipeline_argsort_f32;
156
  vk_pipeline pipeline_sum_rows_f32;
@@ -283,26 +283,15 @@ struct vk_op_diag_mask_push_constants {
283
 
284
  struct vk_op_rope_push_constants {
285
  uint32_t ncols;
 
286
  float freq_scale;
287
  uint32_t p_delta_rows;
288
  float freq_base;
289
  float ext_factor;
290
  float attn_factor;
291
- float corr_dims[4];
292
- };
293
-
294
- struct vk_op_rope_neox_push_constants {
295
- uint32_t ncols;
296
- uint32_t ndims;
297
- float freq_scale;
298
- uint32_t p_delta_rows;
299
- float freq_base;
300
- float ext_factor;
301
- float attn_factor;
302
- float corr_dims[4];
303
  float theta_scale;
304
- float inv_ndims;
305
- uint32_t has_freq_facs;
306
  };
307
 
308
  struct vk_op_soft_max_push_constants {
@@ -1534,11 +1523,11 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
1534
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1535
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1536
 
1537
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1538
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1539
 
1540
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1541
- ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
1542
 
1543
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1544
 
@@ -3905,10 +3894,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
3905
  }
3906
  } else {
3907
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3908
- return ctx->device->pipeline_rope_f32;
3909
  }
3910
  if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3911
- return ctx->device->pipeline_rope_f16;
3912
  }
3913
  }
3914
  return nullptr;
@@ -4152,24 +4141,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
4152
  ggml_vk_sync_buffers(subctx);
4153
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4154
  } else if (op == GGML_OP_ROPE) {
4155
- const int mode = ((int32_t *) dst->op_params)[2];
4156
- const bool is_neox = mode & 2;
4157
-
4158
- if (is_neox) {
4159
- // Empty src2 is possible in rope, but the shader needs a buffer
4160
- vk_subbuffer subbuf_z;
4161
- if (use_src2) {
4162
- subbuf_z = { d_Z, z_buf_offset, z_sz };
4163
- } else {
4164
- subbuf_z = { d_X, 0, d_X->size };
4165
- }
4166
-
4167
- ggml_vk_sync_buffers(subctx);
4168
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4169
  } else {
4170
- ggml_vk_sync_buffers(subctx);
4171
- ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4172
  }
 
 
 
4173
  } else if (use_src2) {
4174
  ggml_vk_sync_buffers(subctx);
4175
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
@@ -4391,7 +4372,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx,
4391
 
4392
  static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4393
  const int n_dims = ((int32_t *) dst->op_params)[1];
4394
- const int mode = ((int32_t *) dst->op_params)[2];
4395
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
4396
  const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
4397
  const float freq_base = ((float *) dst->op_params)[5];
@@ -4401,28 +4382,16 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con
4401
  const float beta_fast = ((float *) dst->op_params)[9];
4402
  const float beta_slow = ((float *) dst->op_params)[10];
4403
 
4404
- const bool is_neox = mode & 2;
4405
-
4406
- #pragma message("TODO: update rope NORM mode to match NEOX mode")
4407
- #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
4408
-
4409
  float corr_dims[2];
4410
  ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
4411
 
4412
- if (is_neox) {
4413
- const float theta_scale = powf(freq_base, -2.0f/n_dims);
4414
- const float inv_ndims = -1.0f / n_dims;
4415
- ggml_vk_op_f32<vk_op_rope_neox_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4416
- (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4417
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims,
4418
- src2 != nullptr,
4419
- });
4420
- } else {
4421
- ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4422
- (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1],
4423
- freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}
4424
- });
4425
- }
4426
  }
4427
 
4428
  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
@@ -6070,7 +6039,13 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
6070
  std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6071
  #endif
6072
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6073
- vk_buffer dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
 
 
 
 
 
 
6074
 
6075
  ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
6076
 
@@ -6466,7 +6441,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
6466
  // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
6467
  // } break;
6468
  case GGML_OP_ROPE:
6469
- return true;
6470
  case GGML_OP_NONE:
6471
  case GGML_OP_RESHAPE:
6472
  case GGML_OP_VIEW:
 
150
  vk_pipeline pipeline_relu_f32;
151
  vk_pipeline pipeline_diag_mask_inf_f32;
152
  vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;
153
+ vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16;
154
  vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
155
  vk_pipeline pipeline_argsort_f32;
156
  vk_pipeline pipeline_sum_rows_f32;
 
283
 
284
  struct vk_op_rope_push_constants {
285
  uint32_t ncols;
286
+ uint32_t n_dims;
287
  float freq_scale;
288
  uint32_t p_delta_rows;
289
  float freq_base;
290
  float ext_factor;
291
  float attn_factor;
292
+ float corr_dims[2];
 
 
 
 
 
 
 
 
 
 
 
293
  float theta_scale;
294
+ uint32_t has_ff;
 
295
  };
296
 
297
  struct vk_op_soft_max_push_constants {
 
1523
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1524
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 3, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1);
1525
 
1526
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f32, "rope_norm_f32", rope_norm_f32_len, rope_norm_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1527
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_norm_f16, "rope_norm_f16", rope_norm_f16_len, rope_norm_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1528
 
1529
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1530
+ ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 4, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
1531
 
1532
  ggml_vk_create_pipeline(ctx, ctx->device->pipeline_argsort_f32, "argsort_f32", argsort_f32_len, argsort_f32_data, "main", 2, sizeof(vk_op_argsort_push_constants), {1024, 1, 1}, {}, 1);
1533
 
 
3894
  }
3895
  } else {
3896
  if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
3897
+ return ctx->device->pipeline_rope_norm_f32;
3898
  }
3899
  if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
3900
+ return ctx->device->pipeline_rope_norm_f16;
3901
  }
3902
  }
3903
  return nullptr;
 
4141
  ggml_vk_sync_buffers(subctx);
4142
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, subbuf_y, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4143
  } else if (op == GGML_OP_ROPE) {
4144
+ // Empty src2 is possible in rope, but the shader needs a buffer
4145
+ vk_subbuffer subbuf_z;
4146
+ if (use_src2) {
4147
+ subbuf_z = { d_Z, z_buf_offset, z_sz };
 
 
 
 
 
 
 
 
 
 
4148
  } else {
4149
+ subbuf_z = { d_X, 0, d_X->size };
 
4150
  }
4151
+
4152
+ ggml_vk_sync_buffers(subctx);
4153
+ ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, subbuf_z, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
4154
  } else if (use_src2) {
4155
  ggml_vk_sync_buffers(subctx);
4156
  ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_Z, z_buf_offset, z_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
 
4372
 
4373
  static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) {
4374
  const int n_dims = ((int32_t *) dst->op_params)[1];
4375
+ // const int mode = ((int32_t *) dst->op_params)[2];
4376
  // const int n_ctx = ((int32_t *) dst->op_params)[3];
4377
  const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
4378
  const float freq_base = ((float *) dst->op_params)[5];
 
4382
  const float beta_fast = ((float *) dst->op_params)[9];
4383
  const float beta_slow = ((float *) dst->op_params)[10];
4384
 
 
 
 
 
 
4385
  float corr_dims[2];
4386
  ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
4387
 
4388
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
4389
+
4390
+ ggml_vk_op_f32<vk_op_rope_push_constants>(ctx, subctx, src0, src1, src2, dst, GGML_OP_ROPE, {
4391
+ (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1],
4392
+ freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1]}, theta_scale,
4393
+ src2 != nullptr,
4394
+ });
 
 
 
 
 
 
 
4395
  }
4396
 
4397
  static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) {
 
6039
  std::cerr << "ggml_backend_vk_buffer_type_alloc_buffer(" << size << ")" << std::endl;
6040
  #endif
6041
  ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
6042
+
6043
+ vk_buffer dev_buffer = nullptr;
6044
+ try {
6045
+ dev_buffer = ggml_vk_create_buffer_device(ctx->ctx, size);
6046
+ } catch (const vk::SystemError& e) {
6047
+ return nullptr;
6048
+ }
6049
 
6050
  ggml_backend_vk_buffer_context * bufctx = new ggml_backend_vk_buffer_context(ctx->ctx, std::move(dev_buffer), ctx->name);
6051
 
 
6441
  // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
6442
  // } break;
6443
  case GGML_OP_ROPE:
6444
+ return ggml_is_contiguous(op->src[0]);
6445
  case GGML_OP_NONE:
6446
  case GGML_OP_RESHAPE:
6447
  case GGML_OP_VIEW: