jeffbolznv committed on
Commit
f5f766b
·
1 Parent(s): 69679f5

vulkan: support CPY from any type to itself (llama/13695)

Browse files

Reuse the f16/f32 copy shaders, and just scale the number of elements
according to the type size.

ggml/src/ggml-vulkan/ggml-vulkan.cpp CHANGED
@@ -4676,6 +4676,19 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const
4676
  }
4677
  }
4678
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4679
  std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
4680
  GGML_ABORT("fatal error");
4681
  }
@@ -6737,7 +6750,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
6737
  case GGML_OP_UNARY:
6738
  case GGML_OP_CONV_2D_DW:
6739
  {
6740
- const uint32_t ne = ggml_nelements(dst);
 
 
 
 
 
 
 
 
 
6741
  if (ne > 262144) {
6742
  elements = { 512, 512, CEIL_DIV(ne, 262144) };
6743
  } else if (ne > 512) {
@@ -7287,8 +7309,19 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
7287
  const uint32_t src0_type_size = ggml_type_size(src0->type);
7288
  const uint32_t dst_type_size = ggml_type_size(dst->type);
7289
 
 
 
 
 
 
 
 
 
 
 
 
7290
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
7291
- (uint32_t)ggml_nelements(src0),
7292
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
7293
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
7294
  0,
@@ -9872,6 +9905,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
9872
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
9873
  return true;
9874
  }
 
 
 
 
 
 
 
 
 
9875
  return false;
9876
  } break;
9877
  case GGML_OP_REPEAT:
 
4676
  }
4677
  }
4678
 
4679
+ if (src->type == to) {
4680
+ // Copy two or four bytes at a time, depending on block size.
4681
+ // For quantized types, we scale by block size/type size. But
4682
+ // this path is also used for bf16->bf16 for example, where the
4683
+ // type size must be exactly 2 or 4.
4684
+ GGML_ASSERT(ggml_is_quantized(to) || ggml_type_size(src->type) == 2 || ggml_type_size(src->type) == 4);
4685
+ if ((ggml_type_size(src->type) % 4) == 0) {
4686
+ return ctx->device->pipeline_contig_cpy_f32_f32;
4687
+ } else {
4688
+ return ctx->device->pipeline_contig_cpy_f16_f16;
4689
+ }
4690
+ }
4691
+
4692
  std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl;
4693
  GGML_ABORT("fatal error");
4694
  }
 
6750
  case GGML_OP_UNARY:
6751
  case GGML_OP_CONV_2D_DW:
6752
  {
6753
+ uint32_t ne = ggml_nelements(dst);
6754
+ if (op == GGML_OP_CPY && ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
6755
+ // Convert from number of logical elements to 2- or 4-byte units.
6756
+ ne /= ggml_blck_size(src0->type);
6757
+ if ((ggml_type_size(src0->type) % 4) == 0) {
6758
+ ne *= ggml_type_size(src0->type) / 4;
6759
+ } else {
6760
+ ne *= ggml_type_size(src0->type) / 2;
6761
+ }
6762
+ }
6763
  if (ne > 262144) {
6764
  elements = { 512, 512, CEIL_DIV(ne, 262144) };
6765
  } else if (ne > 512) {
 
7309
  const uint32_t src0_type_size = ggml_type_size(src0->type);
7310
  const uint32_t dst_type_size = ggml_type_size(dst->type);
7311
 
7312
+ uint32_t ne = (uint32_t)ggml_nelements(src0);
7313
+ if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
7314
+ // Convert from number of logical elements to 2- or 4-byte units.
7315
+ ne /= ggml_blck_size(src0->type);
7316
+ if ((ggml_type_size(src0->type) % 4) == 0) {
7317
+ ne *= ggml_type_size(src0->type) / 4;
7318
+ } else {
7319
+ ne *= ggml_type_size(src0->type) / 2;
7320
+ }
7321
+ }
7322
+
7323
  ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
7324
+ ne,
7325
  (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
7326
  (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
7327
  0,
 
9905
  if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
9906
  return true;
9907
  }
9908
+
9909
+ // We can handle copying from a type to the same type if it's
9910
+ // contiguous (memcpy). We use f16 or f32 shaders to do the copy,
9911
+ // so the type/block size must be a multiple of 2.
9912
+ if (src0_type == src1_type &&
9913
+ ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op) &&
9914
+ (ggml_type_size(src0_type) % 2) == 0) {
9915
+ return true;
9916
+ }
9917
  return false;
9918
  } break;
9919
  case GGML_OP_REPEAT: