ggml : update `ggml_rope_multi` (llama/12665)
* update `rope_multi`:
  1. add `ggml_rope_multi_inplace`;
  2. use `GGML_MROPE_SECTIONS` instead of 4.
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <[email protected]>
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml/include/ggml.h +19 -1
- ggml/src/ggml.c +44 -40
ggml/include/ggml.h
CHANGED

```diff
@@ -241,6 +241,8 @@
 #define GGML_ROPE_TYPE_MROPE  8
 #define GGML_ROPE_TYPE_VISION 24
 
+#define GGML_MROPE_SECTIONS   4
+
 #define GGML_UNUSED(x) (void)(x)
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -1660,7 +1662,7 @@ extern "C" {
             struct ggml_tensor  * b,
             struct ggml_tensor  * c,
             int                   n_dims,
-            int                   sections[4],
+            int                   sections[GGML_MROPE_SECTIONS],
             int                   mode,
             int                   n_ctx_orig,
             float                 freq_base,
@@ -1686,6 +1688,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    GGML_API struct ggml_tensor * ggml_rope_multi_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[GGML_MROPE_SECTIONS],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_rope_custom(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
```
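To make the intent of the header change concrete, here is a minimal, hedged sketch of how a caller could build M-RoPE nodes against the updated declarations. The tensor shapes, section split, and frequency parameters are illustrative assumptions, not values taken from this commit.

```c
#include "ggml.h"

int main(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_dims   = 128; // rotated head dimensions (illustrative)
    const int n_head   = 8;
    const int n_tokens = 4;

    // activations: [n_dims, n_head, n_tokens]
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_dims, n_head, n_tokens);
    // positions: M-RoPE expects 4 position ids per token (see the assert in ggml.c below)
    struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4*n_tokens);

    // per-section rotary dimensions; GGML_MROPE_SECTIONS replaces the bare 4
    int sections[GGML_MROPE_SECTIONS] = { 16, 24, 24, 0 }; // illustrative split

    // copying variant: returns a new tensor
    struct ggml_tensor * cur = ggml_rope_multi(
        ctx, a, pos, NULL, n_dims, sections, GGML_ROPE_TYPE_MROPE,
        /*n_ctx_orig*/ 0, /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
        /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f, /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);

    // new in-place variant: returns a view that writes back into `a`
    struct ggml_tensor * cur_inplace = ggml_rope_multi_inplace(
        ctx, a, pos, NULL, n_dims, sections, GGML_ROPE_TYPE_MROPE,
        0, 10000.0f, 1.0f, 0.0f, 1.0f, 32.0f, 1.0f);

    (void) cur; (void) cur_inplace;
    ggml_free(ctx);
    return 0;
}
```

Since the signature keeps `int sections[GGML_MROPE_SECTIONS]` and the constant is defined as 4, existing callers that pass an `int[4]` array keep compiling unchanged.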
ggml/src/ggml.c
CHANGED

```diff
@@ -3885,6 +3885,7 @@ static struct ggml_tensor * ggml_rope_impl(
         struct ggml_tensor  * b,
         struct ggml_tensor  * c,
         int                   n_dims,
+        int                   sections[GGML_MROPE_SECTIONS],
         int                   mode,
         int                   n_ctx_orig,
         float                 freq_base,
@@ -3898,15 +3899,19 @@ static struct ggml_tensor * ggml_rope_impl(
 
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] == b->ne[0]);
+
+    bool mrope_used = mode & GGML_ROPE_TYPE_MROPE;
+    if (mrope_used) {
+        GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+    } else {
+        GGML_ASSERT(a->ne[2] == b->ne[0]);
+    }
 
     if (c) {
         GGML_ASSERT(c->type == GGML_TYPE_F32);
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
-    int sections[4] = {0, 0, 0, 0};
-
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
     int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
@@ -3916,7 +3921,11 @@ static struct ggml_tensor * ggml_rope_impl(
     memcpy(params + 8, &attn_factor, sizeof(float));
     memcpy(params + 9, &beta_fast, sizeof(float));
     memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(params + 11, &sections, sizeof(int)*4);
+    if (mrope_used) {
+        memcpy(params + 11, sections, sizeof(int32_t) * GGML_MROPE_SECTIONS);
+    } else {
+        memset(params + 11, 0, sizeof(int32_t) * GGML_MROPE_SECTIONS);
+    }
     ggml_set_op_params(result, params, sizeof(params));
 
     result->op = GGML_OP_ROPE;
@@ -3934,7 +3943,7 @@ struct ggml_tensor * ggml_rope(
         int                   n_dims,
         int                   mode) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
+        ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false
     );
 }
 
@@ -3944,7 +3953,7 @@ struct ggml_tensor * ggml_rope_multi(
         struct ggml_tensor  * b,
         struct ggml_tensor  * c,
         int                   n_dims,
-        int                   sections[4],
+        int                   sections[GGML_MROPE_SECTIONS],
         int                   mode,
         int                   n_ctx_orig,
         float                 freq_base,
@@ -3953,36 +3962,31 @@ struct ggml_tensor * ggml_rope_multi(
         float                 attn_factor,
         float                 beta_fast,
         float                 beta_slow) {
-    // Multimodal Rotary Position Embedding
-    GGML_ASSERT((mode & GGML_ROPE_TYPE_MROPE) == GGML_ROPE_TYPE_MROPE);
-
-    GGML_ASSERT(ggml_is_vector(b));
-    GGML_ASSERT(b->type == GGML_TYPE_I32);
-    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
-
-    if (c) {
-        GGML_ASSERT(c->type == GGML_TYPE_F32);
-        GGML_ASSERT(c->ne[0] >= n_dims / 2);
-    }
-
-    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-
-    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
-    memcpy(params + 5, &freq_base, sizeof(float));
-    memcpy(params + 6, &freq_scale, sizeof(float));
-    memcpy(params + 7, &ext_factor, sizeof(float));
-    memcpy(params + 8, &attn_factor, sizeof(float));
-    memcpy(params + 9, &beta_fast, sizeof(float));
-    memcpy(params + 10, &beta_slow, sizeof(float));
-    memcpy(&params[11], sections, sizeof(int)*4);
-    ggml_set_op_params(result, params, sizeof(params));
-
-    result->op = GGML_OP_ROPE;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
+    return ggml_rope_impl(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, false
+    );
+}
 
-    return result;
+struct ggml_tensor * ggml_rope_multi_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[GGML_MROPE_SECTIONS],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    return ggml_rope_impl(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale,
+        ext_factor, attn_factor, beta_fast, beta_slow, true
+    );
 }
 
 struct ggml_tensor * ggml_rope_inplace(
@@ -3992,7 +3996,7 @@ struct ggml_tensor * ggml_rope_inplace(
         int                   n_dims,
         int                   mode) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
+        ctx, a, b, NULL, n_dims, NULL, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true
     );
 }
 
@@ -4011,7 +4015,7 @@ struct ggml_tensor * ggml_rope_ext(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, false
     );
 }
@@ -4031,7 +4035,7 @@ struct ggml_tensor * ggml_rope_ext_inplace(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, c, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, true
     );
 }
@@ -4050,7 +4054,7 @@ struct ggml_tensor * ggml_rope_custom(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, false
     );
 }
@@ -4069,7 +4073,7 @@ struct ggml_tensor * ggml_rope_custom_inplace(
         float                 beta_fast,
         float                 beta_slow) {
     return ggml_rope_impl(
-        ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale,
+        ctx, a, b, NULL, n_dims, NULL, mode, n_ctx_orig, freq_base, freq_scale,
         ext_factor, attn_factor, beta_fast, beta_slow, true
     );
 }
```
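For completeness, a hedged sketch of evaluating the new in-place node on the CPU backend. It assumes a `ggml-cpu.h` header that provides `ggml_graph_compute_with_ctx`, as recent ggml trees do, and reuses the `ctx`/`a`/`pos`/`sections` setup from the header sketch above; none of this is part of the commit itself.

```c
#include "ggml.h"
#include "ggml-cpu.h" // assumption: CPU backend header exposing ggml_graph_compute_with_ctx

// Rotates `a` in place using the updated ggml_rope_multi_inplace API.
static void rope_multi_inplace_demo(struct ggml_context * ctx,
                                    struct ggml_tensor  * a,
                                    struct ggml_tensor  * pos,
                                    int                   sections[GGML_MROPE_SECTIONS]) {
    struct ggml_tensor * rotated = ggml_rope_multi_inplace(
        ctx, a, pos, NULL, /*n_dims*/ (int) a->ne[0], sections, GGML_ROPE_TYPE_MROPE,
        /*n_ctx_orig*/ 0, /*freq_base*/ 10000.0f, /*freq_scale*/ 1.0f,
        /*ext_factor*/ 0.0f, /*attn_factor*/ 1.0f, /*beta_fast*/ 32.0f, /*beta_slow*/ 1.0f);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, rotated);

    // `rotated` is a view of `a`, so after this call `a` holds the rotated values
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);
}
```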