llama : add Qwen2VL support + multimodal RoPE (llama/10361)
* Barebone Qwen2VL LLM converter
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script bug fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, outdated func args
* update `llama_hparams`
* update to keep up with upstream changes
* resolve linter, test errors
* add makefile entry, update special image padding token
* add mrope unit test, fix a few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixes
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <[email protected]>
* fix trailing whitespace
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remove old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <[email protected]>
- ggml/include/ggml.h +19 -1
- ggml/src/ggml-cann/ggml-cann.cpp +9 -0
- ggml/src/ggml-cpu/ggml-cpu.c +211 -33
- ggml/src/ggml-cuda/rope.cu +229 -2
- ggml/src/ggml-kompute/ggml-kompute.cpp +11 -1
- ggml/src/ggml-metal/ggml-metal.m +14 -2
- ggml/src/ggml-sycl/ggml-sycl.cpp +10 -1
- ggml/src/ggml-vulkan/ggml-vulkan.cpp +10 -1
- ggml/src/ggml.c +51 -1
ggml/include/ggml.h

@@ -237,7 +237,9 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-#define GGML_ROPE_TYPE_NEOX 2
+#define GGML_ROPE_TYPE_NEOX   2
+#define GGML_ROPE_TYPE_MROPE  8
+#define GGML_ROPE_TYPE_VISION 24

 #define GGUF_MAGIC "GGUF"

@@ -1443,6 +1445,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);

+    GGML_API struct ggml_tensor * ggml_rope_multi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c,
+            int                   n_dims,
+            int                   sections[4],
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_rope_ext_inplace(
             struct ggml_context * ctx,
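The new public entry point mirrors `ggml_rope_ext` but adds a 4-way `sections` split describing how many rotary pairs are driven by the temporal, height, width and extra position streams. A minimal usage sketch follows; it is not from this commit, the section split and frequency parameters are illustrative placeholders rather than the real Qwen2VL configuration, and `cur`/`pos` are assumed to be tensors built elsewhere in the graph (`pos` must be an I32 vector carrying 4 position ids per token, see `ggml_rope_multi` in ggml.c further down).

```c
#include "ggml.h"

// Hypothetical helper, for illustration only: attach an m-rope node to a graph.
// `cur` is [head_dim, n_head, n_tokens]; `pos` is an I32 vector of 4*n_tokens ids.
static struct ggml_tensor * build_mrope(struct ggml_context * ctx,
                                        struct ggml_tensor  * cur,
                                        struct ggml_tensor  * pos) {
    int sections[4] = {16, 24, 24, 0}; // illustrative t/h/w/e split of the rotary pairs

    return ggml_rope_multi(
        ctx, cur, pos, /*freq_factors*/ NULL,
        /*n_dims     =*/ 128,
        sections,
        GGML_ROPE_TYPE_MROPE,
        /*n_ctx_orig =*/ 0,
        /*freq_base  =*/ 1000000.0f,
        /*freq_scale =*/ 1.0f,
        /*ext_factor =*/ 0.0f,
        /*attn_factor=*/ 1.0f,
        /*beta_fast  =*/ 32.0f,
        /*beta_slow  =*/ 1.0f);
}
```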
ggml/src/ggml-cann/ggml-cann.cpp

@@ -1747,6 +1747,15 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             if (*ext_factor != 0) {
                 return false;
             }
+
+            const int mode = ((const int32_t *) op->op_params)[2];
+            if (mode & GGML_ROPE_TYPE_MROPE) {
+                return false;
+            }
+            if (mode & GGML_ROPE_TYPE_VISION) {
+                return false;
+            }
+
             return true;
         }
         case GGML_OP_UPSCALE: {
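Each backend that does not yet implement the new modes opts out with the same `op_params` check (CANN here, and Kompute, Metal, SYCL and Vulkan below). Note how the flag values compose: `GGML_ROPE_TYPE_VISION` (24) contains the `GGML_ROPE_TYPE_MROPE` bit (8), so testing the m-rope bit also rejects vision rope, while the compute paths further down pick out vision with an exact comparison. A small standalone illustration (constants copied from ggml.h above):

```c
#include <stdbool.h>
#include <stdio.h>

#define GGML_ROPE_TYPE_NEOX   2
#define GGML_ROPE_TYPE_MROPE  8
#define GGML_ROPE_TYPE_VISION 24

int main(void) {
    const int modes[] = {0, GGML_ROPE_TYPE_NEOX, GGML_ROPE_TYPE_MROPE, GGML_ROPE_TYPE_VISION};
    for (int i = 0; i < 4; ++i) {
        const int  mode      = modes[i];
        const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;   // true for m-rope *and* vision
        const bool is_vision = mode == GGML_ROPE_TYPE_VISION; // true only for vision
        printf("mode=%2d  is_mrope=%d  is_vision=%d\n", mode, is_mrope, is_vision);
    }
    return 0;
}
```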
ggml/src/ggml-cpu/ggml-cpu.c

@@ -9133,6 +9133,64 @@ static void ggml_rope_cache_init(
     }
 }

+static void ggml_mrope_cache_init(
+     float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
+     float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
+     float * cache, float sin_sign, float theta_scale) {
+    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
+    float theta_t = theta_base_t;
+    float theta_h = theta_base_h;
+    float theta_w = theta_base_w;
+    float theta_e = theta_base_e;  // extra position id for vision encoder
+    int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
+    int sec_w = sections[1] + sections[0];
+    int sec_e = sections[2] + sec_w;
+    GGML_ASSERT(sect_dims <= ne0);
+
+    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
+        const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
+
+        int sector = (i0 / 2) % sect_dims;
+        if (indep_sects) {
+            // compute theta independently for each dim sections
+            // (i.e. reset corresponding theta when `i0` go from one section to another)
+            if (sector == 0) {
+                theta_t = theta_base_t;
+            }
+            else if (sector == sections[0]) {
+                theta_h = theta_base_h;
+            }
+            else if (sector == sec_w) {
+                theta_w = theta_base_w;
+            }
+            else if (sector == sec_e) {
+                theta_e = theta_base_e;
+            }
+        }
+
+        float theta = theta_t;
+        if (sector >= sections[0] && sector < sec_w) {
+            theta = theta_h;
+        }
+        else if (sector >= sec_w && sector < sec_w + sections[2]) {
+            theta = theta_w;
+        }
+        else if (sector >= sec_w + sections[2]) {
+            theta = theta_e;
+        }
+
+        rope_yarn(
+            theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
+        );
+        cache[i0 + 1] *= sin_sign;
+
+        theta_t *= theta_scale;
+        theta_w *= theta_scale;
+        theta_h *= theta_scale;
+        theta_e *= theta_scale;
+    }
+}
+
 static void ggml_compute_forward_rope_f32(
     const struct ggml_compute_params * params,
     struct ggml_tensor * dst,

@@ -9143,6 +9201,7 @@ static void ggml_compute_forward_rope_f32(
     const struct ggml_tensor * src2 = dst->src[2];

     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    int sections[4];

     //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_dims = ((int32_t *) dst->op_params)[1];

@@ -9156,6 +9215,7 @@ static void ggml_compute_forward_rope_f32(
     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections,    (int32_t *) dst->op_params + 11, sizeof(int)*4);

     GGML_TENSOR_UNARY_OP_LOCALS

@@ -9188,6 +9248,16 @@ static void ggml_compute_forward_rope_f32(
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

     const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne0/2);
+    }

     const float * freq_factors = NULL;
     if (src2 != NULL) {

@@ -9203,18 +9273,63 @@ / @@ -9228,8 +9343,10 @@ / @@ -9239,19 +9356,20 @@ — the main loop of ggml_compute_forward_rope_f32 is restructured around the new modes; after the change it reads:

    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
        for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len

            float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
            if (!is_mrope) {
                const int64_t p = pos[i2];
                ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
            }
            else {
                const int64_t p_t = pos[i2];
                const int64_t p_h = pos[i2 + ne2];
                const int64_t p_w = pos[i2 + ne2 * 2];
                const int64_t p_e = pos[i2 + ne2 * 3];
                ggml_mrope_cache_init(
                    p_t, p_h, p_w, p_e, sections, is_vision,
                    freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
            }

            for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
                if (ir++ < ir0) continue;
                if (ir   > ir1) break;

                if (is_neox || is_mrope) {
                    if (is_vision){
                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                            const int64_t ic = i0/2;

                            const float cos_theta = cache[i0 + 0];
                            const float sin_theta = cache[i0 + 1];

                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

                            const float x0 = src[0];
                            const float x1 = src[n_dims];

                            dst_data[0]      = x0*cos_theta - x1*sin_theta;
                            dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
                        }
                    } else {
                        for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                            const int64_t ic = i0/2;

                            const float cos_theta = cache[i0 + 0];
                            const float sin_theta = cache[i0 + 1];

                            const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
                            float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

                            const float x0 = src[0];
                            const float x1 = src[n_dims/2];

                            dst_data[0]        = x0*cos_theta - x1*sin_theta;
                            dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
                        }
                    }
                } else {
                    // unchanged: the original rotation of adjacent pairs
                    for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
                        const float cos_theta = cache[i0 + 0];
                        const float sin_theta = cache[i0 + 1];
                        ...
                        dst_data[0] = x0*cos_theta - x1*sin_theta;
                        dst_data[1] = x0*sin_theta + x1*cos_theta;
                    }
                }

                if (is_vision) {
                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                        const int64_t ic = i0/2;

                        const float cos_theta = cache[i0 + 0];
                        const float sin_theta = cache[i0 + 1];

                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);

                        const float x0 = src[0];
                        const float x1 = src[n_dims];

                        dst_data[0]      = x0*cos_theta - x1*sin_theta;
                        dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
                    }
                } else {
                    // fill the remain channels with data from src tensor
                    for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
                        const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
                        float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

                        dst_data[0] = src[0];
                        dst_data[1] = src[1];
                    }
                }
            }
        }
    }

@@ -9269,6 +9387,7 @@ … @@ -9364,19 +9542,19 @@ — ggml_compute_forward_rope_f16 receives the same treatment: `sections[4]` is read from op_params + 11, `is_mrope`/`is_vision` are derived from the mode, the cache is filled via ggml_mrope_cache_init when m-rope is active, and the vision/neox/m-rope branches mirror the f32 code with GGML_FP16_TO_FP32/GGML_FP32_TO_FP16 around the loads and stores.
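The key idea in `ggml_mrope_cache_init` is that the rotary pairs of a head are partitioned by `sections` into consecutive groups, and each group takes its angle from a different position stream. A throwaway illustration of the same `sector` arithmetic; the section split here is made up for readability, real models use much larger sections:

```c
#include <stdio.h>

int main(void) {
    const int sections[4] = {2, 3, 3, 0}; // illustrative t/h/w/e split
    const int sect_dims   = sections[0] + sections[1] + sections[2] + sections[3];
    const int sec_w       = sections[0] + sections[1];
    const int n_pairs     = 10;           // head_dim/2 in this toy example

    for (int p = 0; p < n_pairs; ++p) {
        const int sector = p % sect_dims;  // same modulo as the kernel
        char stream = 't';
        if      (sector >= sections[0] && sector < sec_w)               stream = 'h';
        else if (sector >= sec_w       && sector < sec_w + sections[2]) stream = 'w';
        else if (sector >= sec_w + sections[2])                         stream = 'e';
        printf("rotary pair %2d -> position stream %c\n", p, stream);
    }
    return 0;
}
```

With `indep_sects` set (the vision case), each group also restarts its own theta at the section boundary, which is what gives the ViT its 2D-rope behaviour.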
ggml/src/ggml-cuda/rope.cu

@@ -4,6 +4,11 @@ struct rope_corr_dims {
     float v[2];
 };

+struct mrope_sections {
+    int v[4];
+};
+
 static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
     return 1.0f - min(1.0f, max(0.0f, y));

@@ -108,6 +113,105 @@ static __global__ void rope_neox(
     dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }

+template<typename T, bool has_ff>
+static __global__ void rope_multi(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i0 >= n_dims) {
+        const int i = row*ne0 + i0;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows;
+
+    int sect_dims = sections.v[0] + sections.v[1] + sections.v[2] + sections.v[3];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        theta_base = pos[i2 + ne2 * 1]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 2]*powf(theta_scale, i0/2.0f);
+    }
+    else if (sector >= sec_w + sections.v[2]) {
+        theta_base = pos[i2 + ne2 * 3]*powf(theta_scale, i0/2.0f);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_ff>
+static __global__ void rope_vision(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, mrope_sections sections) {
+    const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+
+    if (i0 >= ne0) {
+        return;
+    }
+
+    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+
+    const int i  = row*ne0 + i0/2;
+    const int i2 = row/p_delta_rows; // i2-th tokens
+
+    int sect_dims = sections.v[0] + sections.v[1];
+    int sec_w = sections.v[1] + sections.v[0];
+    int sector = (i0 / 2) % sect_dims;
+
+    float theta_base = 0.0;
+    if (sector < sections.v[0]) {
+        const int p = sector;
+        theta_base = pos[i2]*powf(theta_scale, p);
+    }
+    else if (sector >= sections.v[0] && sector < sec_w) {
+        const int p = sector - sections.v[0];
+        theta_base = pos[i2 + ne2]*powf(theta_scale, p);
+    }
+
+    const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
+
+    float cos_theta;
+    float sin_theta;
+
+    rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims];
+
+    dst[i + 0]      = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims] = x0*sin_theta + x1*cos_theta;
+}
+
 template<typename T>
 static void rope_norm_cuda(
     const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,

@@ -156,6 +260,56 @@ static void rope_neox_cuda(
+template<typename T>
+static void rope_multi_cuda(
+    const T * x, T * dst, int ne0, int ne2, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, mrope_sections sections, cudaStream_t stream) {
+    GGML_ASSERT(ne0 % 2 == 0);
+    const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const dim3 block_nums(nr, n_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    if (freq_factors == nullptr) {
+        rope_multi<T, false><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+        );
+    } else {
+        rope_multi<T, true><<<block_nums, block_dims, 0, stream>>>(
+            x, dst, ne0, ne2, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, freq_factors, sections
+        );
+    }
+}

A templated rope_vision_cuda launcher is added with the same structure (it additionally carries the comment: "break down (head_dim, heads, seq) into (CUDA_ROPE_BLOCK_SIZE, x, heads * seq) where x ~= ceil(head_dim / CUDA_ROPE_BLOCK_SIZE)"), and @@ -185,6 +339,38 @@ adds thin f16/f32 wrappers rope_multi_cuda_f16/_f32 and rope_vision_cuda_f16/_f32 that forward to the templated launchers.

@@ -201,8 +387,9 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
     GGML_ASSERT(src0->type == dst->type);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
+    const int64_t ne00 = src0->ne[0]; // head dims
+    const int64_t ne01 = src0->ne[1]; // num heads
+    const int64_t ne02 = src0->ne[2]; // num heads
     const int64_t nr = ggml_nrows(src0);

@@ -210,6 +397,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int mode = ((int32_t *) dst->op_params)[2];
     //const int n_ctx = ((int32_t *) dst->op_params)[3];
     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
+    mrope_sections sections;

     // RoPE alteration for extended context
     float freq_base;

@@ -225,8 +413,19 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+    memcpy(&sections.v,  (int32_t *) dst->op_params + 11, sizeof(int)*4);

     const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;
+    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+    if (is_mrope) {
+        GGML_ASSERT(sections.v[0] > 0 || sections.v[1] > 0 || sections.v[2] > 0);
+    }
+
+    if (is_vision) {
+        GGML_ASSERT(n_dims == ne00/2);
+    }

     const int32_t * pos = (const int32_t *) src1_d;

@@ -253,6 +452,34 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
         } else {
             GGML_ABORT("fatal error");
         }
+    } else if (is_mrope && !is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_multi_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_multi_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
+    } else if (is_vision) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_vision_cuda_f32(
+                (const float *)src0_d, (float *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_vision_cuda_f16(
+                (const half *)src0_d, (half *)dst_d, ne00, ne02, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, freq_factors, sections, stream
+            );
+        } else {
+            GGML_ABORT("fatal error");
+        }
     } else {
         if (src0->type == GGML_TYPE_F32) {
             rope_norm_cuda_f32(
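The two new kernels differ in how the exponent of `theta_scale` is formed: `rope_multi` keeps the usual global exponent `i0/2` and only switches which position id feeds it, while `rope_vision` restarts the exponent at each section boundary (`p = sector - sections.v[0]`) and rotates pairs that are `n_dims` apart rather than `n_dims/2`. Both launchers use the same grid. A tiny standalone sketch of that launch geometry, with an assumed block-size value purely for illustration:

```c
#include <stdio.h>

// Assumed value for illustration; the real constant lives in the CUDA backend.
#define CUDA_ROPE_BLOCK_SIZE 256

int main(void) {
    const int ne0 = 128;     // head_dim (illustrative)
    const int nr  = 16 * 32; // rows = n_head * n_tokens (illustrative)

    // Same geometry as rope_multi_cuda/rope_vision_cuda above:
    // block = (1, CUDA_ROPE_BLOCK_SIZE, 1), grid = (nr, n_blocks_x, 1),
    // so each thread handles one rotary pair (i0, i0+1) of one row.
    const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
    printf("grid = (%d, %d, 1), block = (1, %d, 1)\n", nr, n_blocks_x, CUDA_ROPE_BLOCK_SIZE);
    return 0;
}
```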
ggml/src/ggml-kompute/ggml-kompute.cpp

@@ -1419,8 +1419,18 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
         case GGML_OP_SOFT_MAX:
         case GGML_OP_RMS_NORM:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_DUP:
         case GGML_OP_CPY:
         case GGML_OP_CONT:
ggml/src/ggml-metal/ggml-metal.m

@@ -1125,8 +1125,18 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             return has_simdgroup_reduction && (op->ne[0] % 4 == 0);
         case GGML_OP_ARGMAX:
         case GGML_OP_NORM:
-        case GGML_OP_ROPE:
             return true;
+        case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return true;
+            }
         case GGML_OP_IM2COL:
             return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_1D:

@@ -3026,7 +3036,9 @@ static void ggml_metal_encode_node(
         } break;
         case GGML_OP_ROPE:
             {
+                // make sure we have one or more position id(ne10) per token(ne02)
+                GGML_ASSERT(ne10 % ne02 == 0);
+                GGML_ASSERT(ne10 >= ne02);

                 const int nth = MIN(1024, ne00);
ggml/src/ggml-sycl/ggml-sycl.cpp

@@ -4488,7 +4488,16 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_IM2COL:
             // TODO: add support for the new F32 operations
             return op->src[0]->type == GGML_TYPE_F16;
ggml/src/ggml-vulkan/ggml-vulkan.cpp

@@ -7687,7 +7687,16 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
         case GGML_OP_REPEAT:
             return ggml_type_size(op->type) == sizeof(float) && ggml_type_size(op->src[0]->type) == sizeof(float);
         case GGML_OP_ROPE:
+            {
+                const int mode = ((const int32_t *) op->op_params)[2];
+                if (mode & GGML_ROPE_TYPE_MROPE) {
+                    return false;
+                }
+                if (mode & GGML_ROPE_TYPE_VISION) {
+                    return false;
+                }
+                return ggml_is_contiguous(op->src[0]);
+            }
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
ggml/src/ggml.c

@@ -3517,15 +3517,18 @@ static struct ggml_tensor * ggml_rope_impl(
         GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }

+    int sections[4] = {0, 0, 0, 0};
+
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params +  5, &freq_base,    sizeof(float));
     memcpy(params +  6, &freq_scale,   sizeof(float));
     memcpy(params +  7, &ext_factor,   sizeof(float));
     memcpy(params +  8, &attn_factor,  sizeof(float));
     memcpy(params +  9, &beta_fast,    sizeof(float));
     memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
     ggml_set_op_params(result, params, sizeof(params));

     result->op = GGML_OP_ROPE;

@@ -3547,6 +3550,53 @@ struct ggml_tensor * ggml_rope(
     );
 }

+struct ggml_tensor * ggml_rope_multi(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        struct ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    // Multimodal Rotary Position Embedding
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    GGML_ASSERT(ggml_is_vector(b));
+    GGML_ASSERT(b->type == GGML_TYPE_I32);
+    GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+    if (c) {
+        GGML_ASSERT(c->type == GGML_TYPE_F32);
+        GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections, sizeof(int)*4);
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
 struct ggml_tensor * ggml_rope_inplace(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,