Judd and ggerganov committed
Commit b4896dc · 1 Parent(s): db4407f

ggml : update `ggml_rope_multi` (llama/12665)


* update `rope_multi`:

1. add `ggml_rope_multi_inplace`;
2. use `GGML_MROPE_SECTIONS` instead of a hard-coded 4 (see the usage sketch below).

* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <[email protected]>

---------

Co-authored-by: Georgi Gerganov <[email protected]>
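For orientation, here is a minimal call-site sketch of the updated API. It is not part of this commit: the context `ctx`, the tensors `cur` and `pos`, `n_dims`, and all numeric values (section sizes, frequency and beta parameters) are illustrative assumptions; only the argument order and the `GGML_MROPE_SECTIONS` macro come from the diff below.

```c
#include "ggml.h"

// Hypothetical call site (illustrative only): apply multimodal RoPE to `cur`
// using the position ids in `pos`. The sections array is now sized with the
// new GGML_MROPE_SECTIONS macro instead of a hard-coded 4.
static struct ggml_tensor * apply_mrope_example(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,   // activations; ne[2] = number of tokens
        struct ggml_tensor  * pos,   // GGML_TYPE_I32; 4 position ids per token for mrope
        int                   n_dims) {
    int sections[GGML_MROPE_SECTIONS] = { 16, 24, 24, 0 }; // example section sizes

    return ggml_rope_multi(
            ctx, cur, pos, /*freq_factors*/ NULL, n_dims, sections,
            GGML_ROPE_TYPE_MROPE, /*n_ctx_orig*/ 0,
            /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
            /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f,
            /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);
}
```

The new `ggml_rope_multi_inplace` takes the identical argument list; per the `ggml_rope_impl` change below, it returns a view of the input tensor instead of a duplicate.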

Files changed (2)
  1. ggml/include/ggml.h +19 -1
  2. ggml/src/ggml.c +44 -40
ggml/include/ggml.h CHANGED

@@ -241,6 +241,8 @@
 #define GGML_ROPE_TYPE_MROPE 8
 #define GGML_ROPE_TYPE_VISION 24
 
+#define GGML_MROPE_SECTIONS 4
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -1660,7 +1662,7 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
             int                   n_dims,
-            int                   sections[4],
+            int                   sections[GGML_MROPE_SECTIONS],
             int                   mode,
             int                   n_ctx_orig,
             float                 freq_base,
@@ -1686,6 +1688,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[GGML_MROPE_SECTIONS],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
ggml/src/ggml.c CHANGED

@@ -3885,6 +3885,7 @@ static struct ggml_tensor * ggml_rope_impl(
         struct ggml_tensor  * b,
         struct ggml_tensor  * c,
         int                   n_dims,
+        int                   sections[GGML_MROPE_SECTIONS],
         int                   mode,
         int                   n_ctx_orig,
         float                 freq_base,
@@ -3898,15 +3899,19 @@ static struct ggml_tensor * ggml_rope_impl(
 
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
+
+    bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
+    if (mrope_used) {
+        GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+    } else {
+        GGML_ASSERT(a->ne[2] == b->ne[0]);
+    }
 
     if (c) {
         GGML_ASSERT(c->type == GGML_TYPE_F32);
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
-    int sections[4] = {0, 0, 0, 0};
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
@@ -3916,7 +3921,11 @@ static struct ggml_tensor * ggml_rope_impl(
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(params + 11, &sections, sizeof(int)*4);
+    if (mrope_used) {
+        memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
+    } else {
+        memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
+    }
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -3934,7 +3943,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
+        ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
     );
 }
 
@@ -3944,7 +3953,7 @@ struct ggml_tensor * ggml_rope_multi(
         struct ggml_tensor  * b,
         struct ggml_tensor  * c,
         int                   n_dims,
-        int                   sections[4],
+        int                   sections[GGML_MROPE_SECTIONS],
         int                   mode,
         int                   n_ctx_orig,
         float                 freq_base,
@@ -3953,36 +3962,31 @@ struct ggml_tensor * ggml_rope_multi(
         float                 attn_factor,
         float                 beta_fast,
         float                 beta_slow) {
-    // Multimodal Rotary Position Embedding
-    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
-
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
-
-    if (c) {
-        GGML_ASSERT(c->type == GGML_TYPE_F32);
-        GGML_ASSERT(c->ne[0] >= n_dims / 2);
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
-    memcpy(params + 5, &freq_base, sizeof(float));
-    memcpy(params + 6, &freq_scale, sizeof(float));
-    memcpy(params + 7, &ext_factor, sizeof(float));
-    memcpy(params + 8, &attn_factor, sizeof(float));
-    memcpy(params + 9, &beta_fast, sizeof(float));
-    memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(&params[11], sections, sizeof(int)*4);
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_ROPE;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
+    return ggml_rope_impl(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, false
+    );
+}
 
-    return result;
+struct ggml_tensor * ggml_rope_multi_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[GGML_MROPE_SECTIONS],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, true
+    );
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -3992,7 +3996,7 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
+        ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
     );
 }
 
@@ -4011,7 +4015,7 @@ struct ggml_tensor * ggml_rope_ext(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, false
     );
 }
@@ -4031,7 +4035,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, true
     );
 }
@@ -4050,7 +4054,7 @@ struct ggml_tensor * ggml_rope_custom(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow, false
    );
 }
@@ -4069,7 +4073,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, true
     );
 }
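
A note on the positions tensor implied by the new assertions in `ggml_rope_impl`: when `GGML_ROPE_TYPE_MROPE` is set in `mode`, `b` must hold 4 position ids per token (`a->ne[2] * 4 == b->ne[0]`), otherwise exactly one per token. Below is a minimal, hedged sketch of sizing that tensor; `ctx` and `n_tokens` are assumptions for illustration, not part of this commit.

```c
// Hedged sketch: allocate the I32 position tensor so it satisfies the
// assertions added to ggml_rope_impl in this commit.
const bool use_mrope = true; // true when (mode & GGML_ROPE_TYPE_MROPE) is set

struct ggml_tensor * pos = ggml_new_tensor_1d(
        ctx, GGML_TYPE_I32,
        use_mrope ? 4 * (int64_t) n_tokens   // mrope: 4 position ids per token
                  :     (int64_t) n_tokens); // regular rope: 1 position id per token
```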